// VERSION 2
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB.
// 7-bit fixed-point RGB->Y coefficients in B,G,R,A memory order (ARGB is
// stored little-endian as B,G,R,A).  Consumers shift right by 7 after
// pmaddubsw/phaddw and then add 16 (kAddY16) for limited-range Y.
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Coefficients sum to 128 (no +16 offset); rounding is done by adding
// kAddYJ64 before the >>7 shift.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// RGB->U coefficients (B,G,R,A order); result is >>8 then biased by 128.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPeg) RGB->U coefficients.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

// RGB->V coefficients (B,G,R,A order).
static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPeg) RGB->V coefficients.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA.
// Same coefficients as ARGB but permuted for A,R,G,B memory order.
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR (R,G,B,A memory order).
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA (A,B,G,R memory order).
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Offset added to Y after the coefficient dot product (limited range).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Bias added to U/V results (applied as bytes after packsswb).
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Word-sized 128 bias plus 0x80 rounding for the full-range (J) UV path,
// added before the >>8 shift.
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
// Index 128 in a pshufb mask zeroes the destination byte; indices 12-15
// here select bytes that the converter later overwrites with alpha.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4.
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
// Duplicate 8 Y bytes into 8 grey ARGB pixels (Y,Y,Y,0xff) per iteration.
// width must be a multiple of 8.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0xff000000 per lane: the alpha channel OR'd into every pixel.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    // Expand each Y byte to 4 bytes (YYYY), then set alpha.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert 16 RGB24 pixels (48 bytes) to 16 ARGB pixels (64 bytes) with
// alpha forced to 0xff.  width must be a multiple of 16.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    // Load 48 bytes; palignr re-seats each group of 4 pixels at a 16-byte
    // boundary before the pshufb expands 12 bytes to 16.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 16 RAW (B,G,R reversed) pixels to 16 ARGB pixels.  Identical to
// RGB24ToARGBRow_SSSE3 except for the shuffle mask, which also swaps R/B.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 RAW pixels (24 bytes) to 8 RGB24 pixels via three overlapping
// loads and three 8-byte shuffled stores.  width must be a multiple of 8.
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"
    "movdqa %5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    // Loads overlap (offsets 0, 4, 8) so each mask sees the bytes it needs.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x18,0) ",%0 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 RGB565 pixels (16 bytes) to 8 ARGB pixels (32 bytes).
// Uses pmulhuw by 0x0108/0x2080 to replicate the high bits of each 5/6-bit
// field into the low bits (e.g. 5-bit x -> (x << 3) | (x >> 2)).
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    // Subtract src twice: stores below use (%1,%0,2), so dst must be
    // rebased by 2*src since dst advances 4 bytes per 2 source bytes.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Convert 8 ARGB1555 pixels to 8 ARGB pixels.  Alpha is sign-extended
// from the top bit (psraw $0x8 then mask) giving 0x00 or 0xff.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    // dst rebased by 2*src for the (%1,%0,2) stores below.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Convert 8 ARGB4444 pixels to 8 ARGB pixels: each nibble n becomes the
// byte (n << 4) | n.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    // dst rebased by 2*src for the (%1,%0,2) stores below.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)  //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)  //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 16 ARGB pixels (64 bytes) to 16 RGB24 pixels (48 bytes):
// pshufb compacts each register to 12 bytes, then shifts/ors splice the
// four 12-byte pieces into three 16-byte stores.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Convert 16 ARGB pixels to 16 RAW pixels.  Identical splicing to
// ARGBToRGB24Row_SSSE3; only the shuffle mask (R/B swapped) differs.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Convert 4 ARGB pixels to 4 RGB565 pixels per iteration by masking each
// channel into its 5/6/5 field and packing dwords to words.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Same as ARGBToRGB565Row_SSE2 but adds a per-pixel ordered-dither value
// (dither4: 4 bytes, one per pixel of the quad) before quantizing.
void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    // Expand the 4 dither bytes to one byte per ARGB channel.
    // NOTE(review): xmm7 (high half of the expansion) is set up but not
    // used in the loop — confirm against upstream; appears benign.
    "movd %3,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm6 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "punpcklwd %%xmm6,%%xmm6 \n"
    "punpckhwd %%xmm7,%%xmm7 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"

    LABELALIGN
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "paddusb %%xmm6,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(dither4)  // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version: 8 pixels per iteration, dither added with vpaddusb.
void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    "vbroadcastss %3,%%xmm6 \n"
    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
    "vpermq $0xd8,%%ymm6,%%ymm6 \n"
    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
    "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
    "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
    "vpslld $0x5,%%ymm4,%%ymm4 \n"
    "vpslld $0xb,%%ymm3,%%ymm5 \n"

    LABELALIGN
  "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
    "vpsrld $0x5,%%ymm0,%%ymm2 \n"
    "vpsrld $0x3,%%ymm0,%%ymm1 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    "vpand %%ymm4,%%ymm2,%%ymm2 \n"
    "vpand %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpor %%ymm2,%%ymm1,%%ymm1 \n"
    "vpor %%ymm1,%%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo vpackusdw lane mutation
    "lea 0x20(%0),%0 \n"
    "vmovdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(dither4)  // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2


// Convert 4 ARGB pixels to 4 ARGB1555 pixels; the alpha bit is the sign
// bit of the pixel's high word (psrad $0x10 then mask with 0x8000).
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Convert 4 ARGB pixels to 4 ARGB4444 pixels by keeping the high nibble
// of each byte and packing.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Y = dot(BGR, kARGBToY) >> 7, plus 16 (limited range).  width must be a
// multiple of 16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    // Add 64 (0.5 in 7-bit fixed point) for round-to-nearest before >>7.
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"  // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Full-range (J) variant: rounds with kAddYJ64, no +16 offset.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"  // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64),   // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two adjacent rows to 8 U and 8 V values
// (2x2 box subsample: vertical pavgb with the next row, then horizontal
// pavgb of pixel pairs).  dst_u/dst_v each receive width/2 bytes.
// NOTE(review): xmm3/xmm4/xmm5 are written but not listed as clobbers —
// confirm against upstream; likely relies on "m" inputs pinning them.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"  // dst_v becomes offset from dst_u for the movhps store
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"  // bias to unsigned 128
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps  %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToV),    // %5
    "m"(kARGBToU),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_SSSE3

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// AVX2 version of ARGBToUVRow: 32 pixels per iteration, producing 16 U
// and 16 V values.
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)  // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1)  // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kAddUV128),   // %5
    "m"(kARGBToV),    // %6
    "m"(kARGBToU),    // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
// Full-range (JPeg) AVX2 UV: same subsampling as ARGBToUVRow_AVX2 but J
// coefficients, and bias+rounding added as words (kAddUVJ128) before the
// arithmetic shift instead of a byte add afterwards.
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)  // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1)  // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),   // %6
    "m"(kARGBToUJ),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
// Full-range (JPeg) SSSE3 UV: like ARGBToUVRow_SSSE3 but J coefficients
// and word-sized bias+rounding (kAddUVJ128) added before the shift.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  //  movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  //  movhps  %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToVJ),   // %5
    "m"(kARGBToUJ),   // %6
    "m"(kAddUVJ128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
// 4:4:4 UV: one U and one V per pixel, no subsampling.
// NOTE: this function continues beyond the end of this chunk.
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0)
",%%xmm0 \n" 1177 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1178 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1179 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1180 "pmaddubsw %%xmm3,%%xmm0 \n" 1181 "pmaddubsw %%xmm3,%%xmm1 \n" 1182 "pmaddubsw %%xmm3,%%xmm2 \n" 1183 "pmaddubsw %%xmm3,%%xmm6 \n" 1184 "phaddw %%xmm1,%%xmm0 \n" 1185 "phaddw %%xmm6,%%xmm2 \n" 1186 "psraw $0x8,%%xmm0 \n" 1187 "psraw $0x8,%%xmm2 \n" 1188 "packsswb %%xmm2,%%xmm0 \n" 1189 "paddb %%xmm5,%%xmm0 \n" 1190 "lea " MEMLEA(0x40,0) ",%0 \n" 1191 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) 1192 "lea " MEMLEA(0x10,1) ",%1 \n" 1193 "sub $0x10,%3 \n" 1194 "jg 1b \n" 1195 : "+r"(src_argb), // %0 1196 "+r"(dst_u), // %1 1197 "+r"(dst_v), // %2 1198 "+rm"(width) // %3 1199 : "m"(kARGBToV), // %4 1200 "m"(kARGBToU), // %5 1201 "m"(kAddUV128) // %6 1202 : "memory", "cc", NACL_R14 1203 "xmm0", "xmm1", "xmm2", "xmm6" 1204 ); 1205 } 1206 #endif // HAS_ARGBTOUV444ROW_SSSE3 1207 1208 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { 1209 asm volatile ( 1210 "movdqa %4,%%xmm5 \n" 1211 "movdqa %3,%%xmm4 \n" 1212 LABELALIGN 1213 "1: \n" 1214 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1215 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1216 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1217 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1218 "pmaddubsw %%xmm4,%%xmm0 \n" 1219 "pmaddubsw %%xmm4,%%xmm1 \n" 1220 "pmaddubsw %%xmm4,%%xmm2 \n" 1221 "pmaddubsw %%xmm4,%%xmm3 \n" 1222 "lea " MEMLEA(0x40,0) ",%0 \n" 1223 "phaddw %%xmm1,%%xmm0 \n" 1224 "phaddw %%xmm3,%%xmm2 \n" 1225 "psrlw $0x7,%%xmm0 \n" 1226 "psrlw $0x7,%%xmm2 \n" 1227 "packuswb %%xmm2,%%xmm0 \n" 1228 "paddb %%xmm5,%%xmm0 \n" 1229 "movdqu %%xmm0," MEMACCESS(1) " \n" 1230 "lea " MEMLEA(0x10,1) ",%1 \n" 1231 "sub $0x10,%2 \n" 1232 "jg 1b \n" 1233 : "+r"(src_bgra), // %0 1234 "+r"(dst_y), // %1 1235 "+r"(width) // %2 1236 : "m"(kBGRAToY), // %3 1237 "m"(kAddY16) // %4 1238 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1239 ); 1240 } 

// Convert 16 BGRA pixels from two rows to 8 U + 8 V (2x2 subsample).
// Same structure as ARGBToUVRow_SSSE3 but with BGRA-ordered coefficients.
// xmm3 = V coeffs, xmm4 = U coeffs, xmm5 = 128 bias.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"  // address dst_v relative to dst_u
    LABELALIGN
  "1: \n"
    // Vertical average of two source rows.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Horizontal pair average.
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    // Weighted sums, shift, pack, bias.
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"  // 8 bytes U
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1) -> 8 bytes V
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_bgra)),  // %4
    "m"(kBGRAToV),   // %5
    "m"(kBGRAToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

// Convert 16 ABGR pixels per iteration to 16 Y; same template as
// BGRAToYRow_SSSE3 with kABGRToY coefficients.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 16 RGBA pixels per iteration to 16 Y; same template with
// kRGBAToY coefficients (alpha byte weighted 0).
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 16 ABGR pixels from two rows to 8 U + 8 V (2x2 subsample);
// same template as BGRAToUVRow_SSSE3 with kABGRToU/kABGRToV coefficients.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_abgr)),  // %4
    "m"(kABGRToV),   // %5
    "m"(kABGRToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

// Convert 16 RGBA pixels from two rows to 8 U + 8 V (2x2 subsample);
// same template with kRGBAToU/kRGBAToV coefficients.
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_rgba0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_rgba)),  // %4
    "m"(kRGBAToV),   // %5
    "m"(kRGBAToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444.  Loads 8 U and 8 V bytes, interleaves them into
// xmm0 as UVUV..., and loads 8 Y bytes duplicated into xmm4.
// v_buf has been pre-biased by -u_buf, so (u_buf, v_buf, 1) addresses V.
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV (punpcklwd duplicates each pair).
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Like READYUV422 but also loads 8 alpha bytes into xmm5.
#define READYUVA422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
    "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
    "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

// Read 2 UV from 411, upsample to 8 UV.
// reading 4 bytes is an msan violation.
// The straightforward loads below over-read (msan violation) or fail
// under drmemory, hence the movzwl-through-a-temp-register variant.
//   "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
//   MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
// pinsrw fails with drmemory
//   __asm pinsrw xmm0, [esi], 0 /* U */
//   __asm pinsrw xmm1, [esi + edi], 0 /* V */
// Reads exactly 2 U and 2 V bytes via a scratch GPR ([temp]), then
// widens them to 8 interleaved UV samples (each UV repeated 4x).
#define READYUV411_TEMP \
    "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
    "movd %[temp],%%xmm0 \n" \
    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
    "movd %[temp],%%xmm1 \n" \
    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from NV12 (interleaved UVUV), upsample to 8 UV.
#define READNV12 \
    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV.  kShuffleNV21 swaps V/U while
// duplicating, so downstream code sees the same UV order as NV12.
#define READNV21 \
    "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
    "pshufb %[kShuffleNV21], %%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.  Loads the 16 packed
// bytes twice and separates Y vs UV with two shuffle masks.
#define READYUY2 \
    "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
    "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
    "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
    "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
    "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
// Same as READYUY2 but for UYVY byte order, using the UYVY shuffle masks.
#define READUYVY \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
    "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
    "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
    "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"

#if defined(__x86_64__)
// x86_64 has xmm8-xmm15: preload the 7 rows of the YuvConstants table
// (U/V coefficients, biases, Y coefficient) into registers once.
#define YUVTORGB_SETUP(yuvconstants) \
    "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
    "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
    "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
    "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y.
// In: xmm0 = interleaved UV, xmm4 = duplicated Y.
// Out: xmm0 = B, xmm1 = G, xmm2 = R (packed to 8 bytes each).
#define YUVTORGB(yuvconstants) \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm3 \n" \
    "movdqa %%xmm11,%%xmm0 \n" \
    "pmaddubsw %%xmm8,%%xmm1 \n" \
    "psubw %%xmm1,%%xmm0 \n" \
    "movdqa %%xmm12,%%xmm1 \n" \
    "pmaddubsw %%xmm9,%%xmm2 \n" \
    "psubw %%xmm2,%%xmm1 \n" \
    "movdqa %%xmm13,%%xmm2 \n" \
    "pmaddubsw %%xmm10,%%xmm3 \n" \
    "psubw %%xmm3,%%xmm2 \n" \
    "pmulhuw %%xmm14,%%xmm4 \n" \
    "paddsw %%xmm4,%%xmm0 \n" \
    "paddsw %%xmm4,%%xmm1 \n" \
    "paddsw %%xmm4,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
// Extra clobbers for the 64-bit register-cached variant.
#define YUVTORGB_REGS \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// 32-bit: only xmm0-xmm7 available, so no setup; the conversion reads
// the YuvConstants rows directly from memory each time.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm3 \n" \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
    "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
    "psubw %%xmm1,%%xmm0 \n" \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
    "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
    "psubw %%xmm2,%%xmm1 \n" \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
    "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
    "psubw %%xmm3,%%xmm2 \n" \
    "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
    "paddsw %%xmm4,%%xmm0 \n" \
    "paddsw %%xmm4,%%xmm1 \n" \
    "paddsw %%xmm4,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS
#endif

// Store 8 ARGB values.  Interleaves B (xmm0), G (xmm1), R (xmm2) and
// alpha (xmm5) into 32 bytes at dst_argb and advances the pointer.
#define STOREARGB \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklbw %%xmm5,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "punpcklwd %%xmm2,%%xmm0 \n" \
    "punpckhwd %%xmm2,%%xmm1 \n" \
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
    "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
    "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

// Store 8 RGBA values.
// Interleaves alpha (0xff from pcmpeqb), B, G, R into RGBA byte order
// and stores 32 bytes at dst_rgba.  Note: regenerates xmm5 = all-ones.
#define STORERGBA \
    "pcmpeqb %%xmm5,%%xmm5 \n" \
    "punpcklbw %%xmm2,%%xmm1 \n" \
    "punpcklbw %%xmm0,%%xmm5 \n" \
    "movdqa %%xmm5,%%xmm0 \n" \
    "punpcklwd %%xmm1,%%xmm5 \n" \
    "punpckhwd %%xmm1,%%xmm0 \n" \
    "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
    "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
    "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"

// Convert 8 I444 (4:4:4 planar) pixels per iteration to ARGB with
// opaque alpha (xmm5 = 0xff from pcmpeqb).
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf addressed relative to u_buf
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff
    LABELALIGN
  "1: \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 I422 pixels per iteration directly to packed 24-bit RGB24,
// shuffling the 32-bit intermediate into 24 output bytes.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
    "sub %[u_buf],%[v_buf] \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave B/G/R words, then shuffle + palignr to drop the
    // alpha lane and emit 24 contiguous bytes.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),           // %[y_buf]
    [u_buf]"+r"(u_buf),           // %[u_buf]
    [v_buf]"+r"(v_buf),           // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),   // %[dst_rgb24]
// 32-bit PIC can run out of registers for "rm"; force a memory operand.
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Convert 8 I422 (4:2:2 planar) pixels per iteration to ARGB with
// opaque alpha.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert 8 I422 pixels + 8 alpha bytes (a_buf) per iteration to ARGB;
// READYUVA422 loads alpha into xmm5 in place of the 0xff constant.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    LABELALIGN
  "1: \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [a_buf]"+r"(a_buf),        // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

#ifdef HAS_I411TOARGBROW_SSSE3
// Convert 8 I411 (4:1:1) pixels per iteration to ARGB; uses a scratch
// GPR (temp) for the exact-width UV loads (see READYUV411_TEMP).
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  int temp;  // scratch register for 2-byte UV loads
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READYUV411_TEMP
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [temp]"=&r"(temp),         // %[temp]
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif

// Convert 8 NV12 pixels per iteration (Y plane + interleaved UV plane)
// to ARGB with opaque alpha.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [uv_buf]"+r"(uv_buf),      // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 NV21 pixels per iteration (Y plane + interleaved VU plane)
// to ARGB; kShuffleNV21 swaps V/U during the read.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [vu_buf]"+r"(vu_buf),      // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 packed YUY2 pixels per iteration to ARGB with opaque alpha.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [yuy2_buf]"+r"(yuy2_buf),  // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 packed UYVY pixels per iteration to ARGB with opaque alpha.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [uyvy_buf]"+r"(uyvy_buf),  // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Convert 8 I422 pixels per iteration to RGBA byte order (alpha first),
// via STORERGBA.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // HAS_I422TOARGBROW_SSSE3

// Read 16 UV from 444.  AVX2 version: 16 U + 16 V bytes interleaved in
// ymm0, 16 Y bytes duplicated in ymm4; vpermq fixes lane ordering.
#define READYUV444_AVX2 \
    "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
    "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
    "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
    "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
    "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
    "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
    "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
    "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
    "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha (loaded to ymm5).
#define READYUVA422_AVX2 \
    "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
    "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
    "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
    "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
    "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
    "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
    "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"

// Read 4 UV from 411, upsample to 16 UV.
// Read 4 UV from 411, upsample to 16 UV.
// Each U/V byte is replicated 4x (unpack bytes, then words, then dwords) so
// 4 chroma samples span 16 luma samples.  Y is duplicated into 16-bit lanes
// of ymm4 for the fixed-point multiply in YUVTORGB_AVX2.
#define READYUV411_AVX2                                                        \
    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                            \n"            \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                            \n"            \
    "vpermq     $0xd8,%%ymm0,%%ymm0                             \n"            \
    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                            \n"            \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "vpermq     $0xd8,%%ymm4,%%ymm4                             \n"            \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                            \n"            \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]              \n"

// Read 8 UV from NV12 (interleaved U,V plane), upsample to 16 UV.
// vpunpcklwd duplicates each UV pair for two adjacent luma samples.
#define READNV12_AVX2                                                          \
    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]            \n"            \
    "vpermq     $0xd8,%%ymm0,%%ymm0                             \n"            \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                            \n"            \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "vpermq     $0xd8,%%ymm4,%%ymm4                             \n"            \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                            \n"            \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]              \n"

// Read 8 VU from NV21 (interleaved V,U plane), upsample to 16 UV.
// kShuffleNV21 both duplicates and swaps byte order so the result is in UV
// order, matching what YUVTORGB_AVX2 expects.
#define READNV21_AVX2                                                          \
    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]            \n"            \
    "vpermq     $0xd8,%%ymm0,%%ymm0                             \n"            \
    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                \n"            \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "vpermq     $0xd8,%%ymm4,%%ymm4                             \n"            \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                            \n"            \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]              \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// The same 32 bytes are loaded twice and shuffled once for Y (ymm4) and once
// for UV (ymm0).
#define READYUY2_AVX2                                                          \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                \n"            \
    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                \n"            \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                \n"            \
    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0               \n"            \
    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]        \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2                                                          \
    "vmovdqu    " MEMACCESS([uyvy_buf]) ",%%ymm4                \n"            \
    "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                \n"            \
    "vmovdqu    " MEMACCESS([uyvy_buf]) ",%%ymm0                \n"            \
    "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0               \n"            \
    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]        \n"

// Convert 16 pixels: 16 UV and 16 Y.
// On x86_64 the conversion constants are preloaded once into ymm8-ymm14
// (YUVTORGB_SETUP_AVX2).  On 32-bit x86 there are not enough ymm registers,
// so the setup macro is empty and each constant is read from memory inside
// YUVTORGB_AVX2.  Output: B in ymm0, G in ymm1, R in ymm2 (packed bytes).
#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                           \n"           \
    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                            \n"           \
    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                            \n"           \
    "vpsubw      %%ymm2,%%ymm13,%%ymm2                           \n"           \
    "vpsubw      %%ymm1,%%ymm12,%%ymm1                           \n"           \
    "vpsubw      %%ymm0,%%ymm11,%%ymm0                           \n"           \
    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                           \n"           \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                            \n"           \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                            \n"           \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                            \n"           \
    "vpsraw      $0x6,%%ymm0,%%ymm0                              \n"           \
    "vpsraw      $0x6,%%ymm1,%%ymm1                              \n"           \
    "vpsraw      $0x6,%%ymm2,%%ymm2                              \n"           \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                            \n"           \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                            \n"           \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                            \n"
// Extra registers clobbered by the 64-bit path; spliced into clobber lists
// (note the trailing comma).
#define YUVTORGB_REGS_AVX2                                                     \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n"          \
    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n"          \
    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0     \n"           \
    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3      \n"           \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                            \n"           \
    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3      \n"           \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                            \n"           \
    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3       \n"           \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                            \n"           \
    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n"         \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                            \n"           \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                            \n"           \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                            \n"           \
    "vpsraw      $0x6,%%ymm0,%%ymm0                              \n"           \
    "vpsraw      $0x6,%%ymm1,%%ymm1                              \n"           \
    "vpsraw      $0x6,%%ymm2,%%ymm2                              \n"           \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                            \n"           \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                            \n"           \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                            \n"
#define YUVTORGB_REGS_AVX2
#endif

// Store 16 ARGB values.  Interleaves B (ymm0), G (ymm1), R (ymm2) and
// A (ymm5) into BGRA byte order and writes two 32-byte stores.
#define STOREARGB_AVX2                                                         \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                             \n"           \
    "vpermq     $0xd8,%%ymm0,%%ymm0                              \n"           \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                             \n"           \
    "vpermq     $0xd8,%%ymm2,%%ymm2                              \n"           \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                             \n"           \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                             \n"           \
    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                 \n"           \
    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "          \n"           \
    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]         \n"

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset from u_buf so one register advances both planes.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha.
    LABELALIGN
    "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration.
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2

#ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset from u_buf so one register advances both planes.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha.
    LABELALIGN
    "1:                                        \n"
    READYUV411_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration.
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I411TOARGBROW_AVX2

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset from u_buf so one register advances both planes.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha.
    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration.
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset from u_buf so one register advances both planes.
    // No vpcmpeqb here: READYUVA422_AVX2 loads the alpha plane into ymm5.
    "sub       %[u_buf],%[v_buf]               \n"
    LABELALIGN
    "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // subl: explicit 32-bit sub because width may be a memory operand
    // (see the i386/PIC constraint below).
    "subl      $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [a_buf]"+r"(a_buf),        // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__) && defined(__pic__)
    // On i386 PIC there are too few free registers; force width to memory.
    [width]"+m"(width)         // %[width]
#else
    [width]"+rm"(width)        // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset from u_buf so one register advances both planes.
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha.
    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA.  Same interleave as STOREARGB_AVX2 but with
    // the alpha byte first (A,B,G,R memory order) instead of last.
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x10,%[width]                  \n"  // 16 pixels per iteration.
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2316 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, 2317 const uint8* uv_buf, 2318 uint8* dst_argb, 2319 const struct YuvConstants* yuvconstants, 2320 int width) { 2321 asm volatile ( 2322 YUVTORGB_SETUP_AVX2(yuvconstants) 2323 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2324 LABELALIGN 2325 "1: \n" 2326 READNV12_AVX2 2327 YUVTORGB_AVX2(yuvconstants) 2328 STOREARGB_AVX2 2329 "sub $0x10,%[width] \n" 2330 "jg 1b \n" 2331 "vzeroupper \n" 2332 : [y_buf]"+r"(y_buf), // %[y_buf] 2333 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2334 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2335 [width]"+rm"(width) // %[width] 2336 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2337 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2338 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2339 ); 2340 } 2341 #endif // HAS_NV12TOARGBROW_AVX2 2342 2343 #if defined(HAS_NV21TOARGBROW_AVX2) 2344 // 16 pixels. 2345 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2346 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, 2347 const uint8* vu_buf, 2348 uint8* dst_argb, 2349 const struct YuvConstants* yuvconstants, 2350 int width) { 2351 asm volatile ( 2352 YUVTORGB_SETUP_AVX2(yuvconstants) 2353 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2354 LABELALIGN 2355 "1: \n" 2356 READNV21_AVX2 2357 YUVTORGB_AVX2(yuvconstants) 2358 STOREARGB_AVX2 2359 "sub $0x10,%[width] \n" 2360 "jg 1b \n" 2361 "vzeroupper \n" 2362 : [y_buf]"+r"(y_buf), // %[y_buf] 2363 [vu_buf]"+r"(vu_buf), // %[vu_buf] 2364 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2365 [width]"+rm"(width) // %[width] 2366 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2367 [kShuffleNV21]"m"(kShuffleNV21) 2368 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2369 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2370 ); 2371 } 2372 #endif // HAS_NV21TOARGBROW_AVX2 2373 2374 #if defined(HAS_YUY2TOARGBROW_AVX2) 2375 // 16 pixels. 2376 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
2377 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2378 uint8* dst_argb, 2379 const struct YuvConstants* yuvconstants, 2380 int width) { 2381 asm volatile ( 2382 YUVTORGB_SETUP_AVX2(yuvconstants) 2383 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2384 LABELALIGN 2385 "1: \n" 2386 READYUY2_AVX2 2387 YUVTORGB_AVX2(yuvconstants) 2388 STOREARGB_AVX2 2389 "sub $0x10,%[width] \n" 2390 "jg 1b \n" 2391 "vzeroupper \n" 2392 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 2393 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2394 [width]"+rm"(width) // %[width] 2395 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2396 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 2397 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 2398 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2399 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2400 ); 2401 } 2402 #endif // HAS_YUY2TOARGBROW_AVX2 2403 2404 #if defined(HAS_UYVYTOARGBROW_AVX2) 2405 // 16 pixels. 2406 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2407 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, 2408 uint8* dst_argb, 2409 const struct YuvConstants* yuvconstants, 2410 int width) { 2411 asm volatile ( 2412 YUVTORGB_SETUP_AVX2(yuvconstants) 2413 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2414 LABELALIGN 2415 "1: \n" 2416 READUYVY_AVX2 2417 YUVTORGB_AVX2(yuvconstants) 2418 STOREARGB_AVX2 2419 "sub $0x10,%[width] \n" 2420 "jg 1b \n" 2421 "vzeroupper \n" 2422 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] 2423 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2424 [width]"+rm"(width) // %[width] 2425 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2426 [kShuffleUYVYY]"m"(kShuffleUYVYY), 2427 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) 2428 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 
2429 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2430 ); 2431 } 2432 #endif // HAS_UYVYTOARGBROW_AVX2 2433 2434 #ifdef HAS_I400TOARGBROW_SSE2 2435 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { 2436 asm volatile ( 2437 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 2438 "movd %%eax,%%xmm2 \n" 2439 "pshufd $0x0,%%xmm2,%%xmm2 \n" 2440 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 2441 "movd %%eax,%%xmm3 \n" 2442 "pshufd $0x0,%%xmm3,%%xmm3 \n" 2443 "pcmpeqb %%xmm4,%%xmm4 \n" 2444 "pslld $0x18,%%xmm4 \n" 2445 LABELALIGN 2446 "1: \n" 2447 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 2448 "movq " MEMACCESS(0) ",%%xmm0 \n" 2449 "lea " MEMLEA(0x8,0) ",%0 \n" 2450 "punpcklbw %%xmm0,%%xmm0 \n" 2451 "pmulhuw %%xmm2,%%xmm0 \n" 2452 "psubusw %%xmm3,%%xmm0 \n" 2453 "psrlw $6, %%xmm0 \n" 2454 "packuswb %%xmm0,%%xmm0 \n" 2455 2456 // Step 2: Weave into ARGB 2457 "punpcklbw %%xmm0,%%xmm0 \n" 2458 "movdqa %%xmm0,%%xmm1 \n" 2459 "punpcklwd %%xmm0,%%xmm0 \n" 2460 "punpckhwd %%xmm1,%%xmm1 \n" 2461 "por %%xmm4,%%xmm0 \n" 2462 "por %%xmm4,%%xmm1 \n" 2463 "movdqu %%xmm0," MEMACCESS(1) " \n" 2464 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 2465 "lea " MEMLEA(0x20,1) ",%1 \n" 2466 2467 "sub $0x8,%2 \n" 2468 "jg 1b \n" 2469 : "+r"(y_buf), // %0 2470 "+r"(dst_argb), // %1 2471 "+rm"(width) // %2 2472 : 2473 : "memory", "cc", "eax" 2474 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 2475 ); 2476 } 2477 #endif // HAS_I400TOARGBROW_SSE2 2478 2479 #ifdef HAS_I400TOARGBROW_AVX2 2480 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). 2481 // note: vpunpcklbw mutates and vpackuswb unmutates. 
// AVX2 version of I400ToARGBRow: 16 gray pixels per iteration.
// NOTE: the two constant comments were previously swapped; corrected so each
// annotation matches its immediate value.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov        $0x4a354a35,%%eax              \n"  // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    "mov        $0x4880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = 0xff000000 mask.
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // unsigned saturate at 0.
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    // Step 2: Weave into ARGB, then set alpha = 0xff.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea        " MEMLEA(0x40,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Reverse a row of bytes: reads 16 bytes from the end of src, byte-reverses
// with pshufb, writes forward to dst.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
    "1:                                        \n"
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // width also serves as the
    "jg        1b                              \n"  // shrinking end offset.
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// 32 bytes per iteration; vpshufb reverses within 16-byte lanes and vpermq
// swaps the two lanes to complete the full 32-byte reversal.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "vbroadcastf128 %3,%%ymm5                  \n"
    LABELALIGN
    "1:                                        \n"
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Reverses UV pairs and deinterleaves: reversed U bytes land in the low
// 8 bytes, reversed V bytes in the high 8 bytes.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirror an interleaved UV row into separate U and V planes.
// Walks src backwards (starts at src + 2*width - 16), writes U/V forwards.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // src + 2*width - 16.
    "sub       %1,%2                           \n"  // dst_v as offset of dst_u.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes = U.
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $8,%3                           \n"  // 8 UV pairs per iteration.
    "jg        1b                              \n"
  : "+r"(src),         // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(temp_width)   // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSE2

// Mirror a row of 32-bit ARGB pixels; pshufd 0x1b reverses the 4 pixels in
// each 16-byte load, and the row is read back-to-front.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // src + 4*width - 16.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration.
    "jg        1b                              \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  :
  : "memory", "cc"
  , "xmm0"
  );
}
#endif  // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
// Dword-reversal permute for vpermd: reverses 8 ARGB pixels at once.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
// Mirror a row of ARGB pixels, 8 pixels per iteration, reading backwards.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "vmovdqu    %3,%%ymm5                      \n"
    LABELALIGN
    "1:                                        \n"
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0)    //  vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kARGBShuffleMirror_AVX2)  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_AVX2
// Deinterleave UV into separate U and V planes, 32 pairs per iteration.
// ymm5 = 0x00ff words: even (U) bytes masked, odd (V) bytes shifted down.
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    "sub        %1,%2                          \n"  // dst_v as offset of dst_u.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // undo pack lane-crossing.
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)          //  vmovdqu %%ymm2,(%1,%2)
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
// SSE2 version: 16 UV pairs per iteration, same mask/shift/pack scheme.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"
    "psrlw      $0x8,%%xmm5                    \n"
    "sub        %1,%2                          \n"  // dst_v as offset of dst_u.
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "movdqa     %%xmm1,%%xmm3                  \n"
    "pand       %%xmm5,%%xmm0                  \n"
    "pand       %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%3                       \n"
    "jg         1b                             \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
// Interleave separate U and V planes into a UV plane, 32 pairs per iteration.
// vextractf128 stores keep the output in source order despite lane-split
// unpacks.
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub        %0,%1                          \n"  // src_v as offset of src_u.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)          //  vmovdqu (%0,%1,1),%%ymm1
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
    "lea        " MEMLEA(0x40,2) ",%2          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
// SSE2 version: 16 UV pairs per iteration via punpcklbw/punpckhbw.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int
width) {
  asm volatile (
    "sub        %0,%1                          \n"  // src_v as offset of src_u.
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu (%0,%1,1),%%xmm1
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "punpcklbw  %%xmm1,%%xmm0                  \n"
    "punpckhbw  %%xmm1,%%xmm2                  \n"
    "movdqu     %%xmm0," MEMACCESS(2) "        \n"
    "movdqu     %%xmm2," MEMACCESS2(0x10,2) "  \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x10,%3                       \n"
    "jg         1b                             \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_COPYROW_SSE2
// Copy a row, 32 bytes per iteration.  Uses aligned movdqa when both src and
// dst are 16-byte aligned (label 1), otherwise the unaligned movdqu path
// (label 2).
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "test       $0xf,%0                        \n"
    "jne        2f                             \n"
    "test       $0xf,%1                        \n"
    "jne        2f                             \n"
    LABELALIGN
    "1:                                        \n"
    "movdqa     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqa     %%xmm0," MEMACCESS(1) "        \n"
    "movdqa     %%xmm1," MEMACCESS2(0x10,1) "  \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "jmp        9f                             \n"
    LABELALIGN
    "2:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    "movdqu     %%xmm1," MEMACCESS2(0x10,1) "  \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         2b                             \n"
    "9:                                        \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
// Copy a row, 64 bytes per iteration, unaligned loads/stores.
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea        " MEMLEA(0x40,1) ",%1          \n"
    "sub        $0x40,%2                       \n"
    "jg         1b                             \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_AVX

#ifdef HAS_COPYROW_ERMS
// Multiple of 1.  Uses rep movsb (fast with Enhanced Rep MovSB hardware).
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),        // %0
    "+D"(dst),        // %1
    "+c"(width_tmp)   // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copy only the alpha channel of src ARGB into dst ARGB, keeping dst RGB.
// xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (RGB mask).
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb    %%xmm0,%%xmm0                  \n"
    "pslld      $0x18,%%xmm0                   \n"
    "pcmpeqb    %%xmm1,%%xmm1                  \n"
    "psrld      $0x8,%%xmm1                    \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm2        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm3  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqu     " MEMACCESS(1) ",%%xmm4        \n"
    "movdqu     " MEMACCESS2(0x10,1) ",%%xmm5  \n"
    "pand       %%xmm0,%%xmm2                  \n"  // src alpha.
    "pand       %%xmm0,%%xmm3                  \n"
    "pand       %%xmm1,%%xmm4                  \n"  // dst RGB.
    "pand       %%xmm1,%%xmm5                  \n"
    "por        %%xmm4,%%xmm2                  \n"
    "por        %%xmm5,%%xmm3                  \n"
    "movdqu     %%xmm2," MEMACCESS(1) "        \n"
    "movdqu     %%xmm3," MEMACCESS2(0x10,1) "  \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"  // 8 pixels per iteration.
    "jg         1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 version: vpblendvb with a 0x00ffffff mask keeps dst RGB bytes and
// takes alpha bytes from src, 16 pixels per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb   %%ymm0,%%ymm0,%%ymm0           \n"
    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm2  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpblendvb  %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb  %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu    %%ymm1," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm2," MEMACCESS2(0x20,1) "  \n"
    "lea        " MEMLEA(0x40,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extract the alpha channel of 8 ARGB pixels into 8 bytes.
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ", %%xmm0       \n"
    "movdqu     " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
    "lea        " MEMLEA(0x20, 0) ", %0        \n"
    "psrld      $0x18, %%xmm0                  \n"  // alpha to low byte.
    "psrld      $0x18, %%xmm1                  \n"
    "packssdw   %%xmm1, %%xmm0                 \n"
    "packuswb   %%xmm0, %%xmm0                 \n"
    "movq       %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x8, 1) ", %1         \n"
    "sub        $0x8, %2                       \n"
    "jg         1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copy 8 Y bytes from src into the alpha channel of 8 dst ARGB pixels.
// punpckhwd into xmm3 interleaves with stale xmm3 contents, but the
// subsequent 0xff000000 mask keeps only bytes sourced from xmm2.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb    %%xmm0,%%xmm0                  \n"  // alpha mask 0xff000000.
    "pslld      $0x18,%%xmm0                   \n"
    "pcmpeqb    %%xmm1,%%xmm1                  \n"  // RGB mask 0x00ffffff.
    "psrld      $0x8,%%xmm1                    \n"
    LABELALIGN
    "1:                                        \n"
    "movq       " MEMACCESS(0) ",%%xmm2        \n"
    "lea        " MEMLEA(0x8,0) ",%0           \n"
    "punpcklbw  %%xmm2,%%xmm2                  \n"
    "punpckhwd  %%xmm2,%%xmm3                  \n"
    "punpcklwd  %%xmm2,%%xmm2                  \n"
    "movdqu     " MEMACCESS(1) ",%%xmm4        \n"
    "movdqu     "
MEMACCESS2(0x10,1) ",%%xmm5 \n"
    "pand       %%xmm0,%%xmm2                  \n"  // Y into alpha position.
    "pand       %%xmm0,%%xmm3                  \n"
    "pand       %%xmm1,%%xmm4                  \n"  // keep dst RGB.
    "pand       %%xmm1,%%xmm5                  \n"
    "por        %%xmm4,%%xmm2                  \n"
    "por        %%xmm5,%%xmm3                  \n"
    "movdqu     %%xmm2," MEMACCESS(1) "        \n"
    "movdqu     %%xmm3," MEMACCESS2(0x10,1) "  \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"  // 8 pixels per iteration.
    "jg         1b                             \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 version: zero-extend 16 Y bytes to dwords, shift into the alpha byte,
// then vpblendvb keeps dst RGB (mask 0x00ffffff) and takes the new alpha.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb   %%ymm0,%%ymm0,%%ymm0           \n"
    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
    LABELALIGN
    "1:                                        \n"
    "vpmovzxbd  " MEMACCESS(0) ",%%ymm1        \n"
    "vpmovzxbd  " MEMACCESS2(0x8,0) ",%%ymm2   \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "vpslld     $0x18,%%ymm1,%%ymm1            \n"
    "vpslld     $0x18,%%ymm2,%%ymm2            \n"
    "vpblendvb  %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb  %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu    %%ymm1," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm2," MEMACCESS2(0x20,1) "  \n"
    "lea        " MEMLEA(0x40,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Fill width bytes with v8, 4 bytes at a time via rep stosl.
// Caller is expected to pass a width that is a multiple of 4.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
  : "+D"(dst),       // %0
    "+c"(width_tmp)  // %1
  : "a"(v32)         // %2
  : "memory", "cc");
}

// Fill width bytes with v8 using rep stosb (any width).
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosb " MEMSTORESTRING(al,0) "        \n"
  : "+D"(dst),       // %0
    "+c"(width_tmp)  // %1
  : "a"(v8)          // %2
  : "memory", "cc");
}

// Fill width ARGB pixels (dwords) with v32.
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
  : "+D"(dst_argb),  // %0
    "+c"(width_tmp)  // %1
  : "a"(v32)         // %2
  : "memory", "cc");
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
// Extract 16 Y bytes from 16 YUY2 pixels (even bytes are Y).
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // xmm5 = 0x00ff words.
    "psrlw      $0x8,%%xmm5                    \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "pand       %%xmm5,%%xmm0                  \n"
    "pand       %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm5"
  );
}

// Extract U and V from 16 YUY2 pixels, averaging this row with the next
// (stride_yuy2) row before deinterleaving U and V.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // xmm5 = 0x00ff words.
    "psrlw      $0x8,%%xmm5                    \n"
    "sub        %1,%2                          \n"  // dst_v as offset of dst_u.
    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu 0x10(%0,%4,1),%%xmm3
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "pavgb      %%xmm2,%%xmm0                  \n"
3108 "pavgb %%xmm3,%%xmm1 \n" 3109 "psrlw $0x8,%%xmm0 \n" 3110 "psrlw $0x8,%%xmm1 \n" 3111 "packuswb %%xmm1,%%xmm0 \n" 3112 "movdqa %%xmm0,%%xmm1 \n" 3113 "pand %%xmm5,%%xmm0 \n" 3114 "packuswb %%xmm0,%%xmm0 \n" 3115 "psrlw $0x8,%%xmm1 \n" 3116 "packuswb %%xmm1,%%xmm1 \n" 3117 "movq %%xmm0," MEMACCESS(1) " \n" 3118 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3119 "lea " MEMLEA(0x8,1) ",%1 \n" 3120 "sub $0x10,%3 \n" 3121 "jg 1b \n" 3122 : "+r"(src_yuy2), // %0 3123 "+r"(dst_u), // %1 3124 "+r"(dst_v), // %2 3125 "+r"(width) // %3 3126 : "r"((intptr_t)(stride_yuy2)) // %4 3127 : "memory", "cc", NACL_R14 3128 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3129 ); 3130 } 3131 3132 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3133 uint8* dst_u, uint8* dst_v, int width) { 3134 asm volatile ( 3135 "pcmpeqb %%xmm5,%%xmm5 \n" 3136 "psrlw $0x8,%%xmm5 \n" 3137 "sub %1,%2 \n" 3138 LABELALIGN 3139 "1: \n" 3140 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3141 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3142 "lea " MEMLEA(0x20,0) ",%0 \n" 3143 "psrlw $0x8,%%xmm0 \n" 3144 "psrlw $0x8,%%xmm1 \n" 3145 "packuswb %%xmm1,%%xmm0 \n" 3146 "movdqa %%xmm0,%%xmm1 \n" 3147 "pand %%xmm5,%%xmm0 \n" 3148 "packuswb %%xmm0,%%xmm0 \n" 3149 "psrlw $0x8,%%xmm1 \n" 3150 "packuswb %%xmm1,%%xmm1 \n" 3151 "movq %%xmm0," MEMACCESS(1) " \n" 3152 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3153 "lea " MEMLEA(0x8,1) ",%1 \n" 3154 "sub $0x10,%3 \n" 3155 "jg 1b \n" 3156 : "+r"(src_yuy2), // %0 3157 "+r"(dst_u), // %1 3158 "+r"(dst_v), // %2 3159 "+r"(width) // %3 3160 : 3161 : "memory", "cc", NACL_R14 3162 "xmm0", "xmm1", "xmm5" 3163 ); 3164 } 3165 3166 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { 3167 asm volatile ( 3168 LABELALIGN 3169 "1: \n" 3170 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3171 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3172 "lea " MEMLEA(0x20,0) ",%0 \n" 3173 "psrlw $0x8,%%xmm0 \n" 3174 "psrlw $0x8,%%xmm1 \n" 3175 "packuswb %%xmm1,%%xmm0 \n" 3176 "movdqu %%xmm0," 
MEMACCESS(1) " \n" 3177 "lea " MEMLEA(0x10,1) ",%1 \n" 3178 "sub $0x10,%2 \n" 3179 "jg 1b \n" 3180 : "+r"(src_uyvy), // %0 3181 "+r"(dst_y), // %1 3182 "+r"(width) // %2 3183 : 3184 : "memory", "cc" 3185 , "xmm0", "xmm1" 3186 ); 3187 } 3188 3189 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3190 uint8* dst_u, uint8* dst_v, int width) { 3191 asm volatile ( 3192 "pcmpeqb %%xmm5,%%xmm5 \n" 3193 "psrlw $0x8,%%xmm5 \n" 3194 "sub %1,%2 \n" 3195 LABELALIGN 3196 "1: \n" 3197 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3198 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3199 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3200 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3201 "lea " MEMLEA(0x20,0) ",%0 \n" 3202 "pavgb %%xmm2,%%xmm0 \n" 3203 "pavgb %%xmm3,%%xmm1 \n" 3204 "pand %%xmm5,%%xmm0 \n" 3205 "pand %%xmm5,%%xmm1 \n" 3206 "packuswb %%xmm1,%%xmm0 \n" 3207 "movdqa %%xmm0,%%xmm1 \n" 3208 "pand %%xmm5,%%xmm0 \n" 3209 "packuswb %%xmm0,%%xmm0 \n" 3210 "psrlw $0x8,%%xmm1 \n" 3211 "packuswb %%xmm1,%%xmm1 \n" 3212 "movq %%xmm0," MEMACCESS(1) " \n" 3213 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3214 "lea " MEMLEA(0x8,1) ",%1 \n" 3215 "sub $0x10,%3 \n" 3216 "jg 1b \n" 3217 : "+r"(src_uyvy), // %0 3218 "+r"(dst_u), // %1 3219 "+r"(dst_v), // %2 3220 "+r"(width) // %3 3221 : "r"((intptr_t)(stride_uyvy)) // %4 3222 : "memory", "cc", NACL_R14 3223 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3224 ); 3225 } 3226 3227 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3228 uint8* dst_u, uint8* dst_v, int width) { 3229 asm volatile ( 3230 "pcmpeqb %%xmm5,%%xmm5 \n" 3231 "psrlw $0x8,%%xmm5 \n" 3232 "sub %1,%2 \n" 3233 LABELALIGN 3234 "1: \n" 3235 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3236 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3237 "lea " MEMLEA(0x20,0) ",%0 \n" 3238 "pand %%xmm5,%%xmm0 \n" 3239 "pand %%xmm5,%%xmm1 \n" 3240 "packuswb %%xmm1,%%xmm0 \n" 3241 "movdqa %%xmm0,%%xmm1 \n" 3242 "pand %%xmm5,%%xmm0 \n" 3243 "packuswb %%xmm0,%%xmm0 \n" 3244 
"psrlw $0x8,%%xmm1 \n" 3245 "packuswb %%xmm1,%%xmm1 \n" 3246 "movq %%xmm0," MEMACCESS(1) " \n" 3247 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3248 "lea " MEMLEA(0x8,1) ",%1 \n" 3249 "sub $0x10,%3 \n" 3250 "jg 1b \n" 3251 : "+r"(src_uyvy), // %0 3252 "+r"(dst_u), // %1 3253 "+r"(dst_v), // %2 3254 "+r"(width) // %3 3255 : 3256 : "memory", "cc", NACL_R14 3257 "xmm0", "xmm1", "xmm5" 3258 ); 3259 } 3260 #endif // HAS_YUY2TOYROW_SSE2 3261 3262 #ifdef HAS_YUY2TOYROW_AVX2 3263 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { 3264 asm volatile ( 3265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3266 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3267 LABELALIGN 3268 "1: \n" 3269 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3270 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3271 "lea " MEMLEA(0x40,0) ",%0 \n" 3272 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 3273 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 3274 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3275 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3276 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 3277 "lea " MEMLEA(0x20,1) ",%1 \n" 3278 "sub $0x20,%2 \n" 3279 "jg 1b \n" 3280 "vzeroupper \n" 3281 : "+r"(src_yuy2), // %0 3282 "+r"(dst_y), // %1 3283 "+r"(width) // %2 3284 : 3285 : "memory", "cc" 3286 , "xmm0", "xmm1", "xmm5" 3287 ); 3288 } 3289 3290 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3291 uint8* dst_u, uint8* dst_v, int width) { 3292 asm volatile ( 3293 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3294 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3295 "sub %1,%2 \n" 3296 LABELALIGN 3297 "1: \n" 3298 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3299 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3300 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3301 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 3302 "lea " MEMLEA(0x40,0) ",%0 \n" 3303 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3304 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" 3305 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3306 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3307 "vpand %%ymm5,%%ymm0,%%ymm1 \n" 3308 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3309 "vpackuswb 
%%ymm1,%%ymm1,%%ymm1 \n" 3310 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3311 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3312 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3313 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3314 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) 3315 "lea " MEMLEA(0x10,1) ",%1 \n" 3316 "sub $0x20,%3 \n" 3317 "jg 1b \n" 3318 "vzeroupper \n" 3319 : "+r"(src_yuy2), // %0 3320 "+r"(dst_u), // %1 3321 "+r"(dst_v), // %2 3322 "+r"(width) // %3 3323 : "r"((intptr_t)(stride_yuy2)) // %4 3324 : "memory", "cc", NACL_R14 3325 "xmm0", "xmm1", "xmm5" 3326 ); 3327 } 3328 3329 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3330 uint8* dst_u, uint8* dst_v, int width) { 3331 asm volatile ( 3332 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3333 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3334 "sub %1,%2 \n" 3335 LABELALIGN 3336 "1: \n" 3337 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3338 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3339 "lea " MEMLEA(0x40,0) ",%0 \n" 3340 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3341 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" 3342 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3343 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3344 "vpand %%ymm5,%%ymm0,%%ymm1 \n" 3345 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3346 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" 3347 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3348 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3349 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3350 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3351 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) 3352 "lea " MEMLEA(0x10,1) ",%1 \n" 3353 "sub $0x20,%3 \n" 3354 "jg 1b \n" 3355 "vzeroupper \n" 3356 : "+r"(src_yuy2), // %0 3357 "+r"(dst_u), // %1 3358 "+r"(dst_v), // %2 3359 "+r"(width) // %3 3360 : 3361 : "memory", "cc", NACL_R14 3362 "xmm0", "xmm1", "xmm5" 3363 ); 3364 } 3365 3366 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { 3367 asm volatile ( 3368 LABELALIGN 3369 "1: \n" 3370 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3371 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3372 "lea " 
MEMLEA(0x40,0) ",%0 \n" 3373 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3374 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" 3375 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3376 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3377 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 3378 "lea " MEMLEA(0x20,1) ",%1 \n" 3379 "sub $0x20,%2 \n" 3380 "jg 1b \n" 3381 "vzeroupper \n" 3382 : "+r"(src_uyvy), // %0 3383 "+r"(dst_y), // %1 3384 "+r"(width) // %2 3385 : 3386 : "memory", "cc" 3387 , "xmm0", "xmm1", "xmm5" 3388 ); 3389 } 3390 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3391 uint8* dst_u, uint8* dst_v, int width) { 3392 asm volatile ( 3393 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3394 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3395 "sub %1,%2 \n" 3396 3397 LABELALIGN 3398 "1: \n" 3399 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3400 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3401 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3402 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 3403 "lea " MEMLEA(0x40,0) ",%0 \n" 3404 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 3405 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 3406 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3407 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3408 "vpand %%ymm5,%%ymm0,%%ymm1 \n" 3409 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3410 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" 3411 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3412 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3413 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3414 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3415 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) 3416 "lea " MEMLEA(0x10,1) ",%1 \n" 3417 "sub $0x20,%3 \n" 3418 "jg 1b \n" 3419 "vzeroupper \n" 3420 : "+r"(src_uyvy), // %0 3421 "+r"(dst_u), // %1 3422 "+r"(dst_v), // %2 3423 "+r"(width) // %3 3424 : "r"((intptr_t)(stride_uyvy)) // %4 3425 : "memory", "cc", NACL_R14 3426 "xmm0", "xmm1", "xmm5" 3427 ); 3428 } 3429 3430 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3431 uint8* dst_u, uint8* dst_v, int width) { 3432 asm volatile ( 3433 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3434 "vpsrlw $0x8,%%ymm5,%%ymm5 
\n" 3435 "sub %1,%2 \n" 3436 LABELALIGN 3437 "1: \n" 3438 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3439 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3440 "lea " MEMLEA(0x40,0) ",%0 \n" 3441 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 3442 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 3443 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3444 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3445 "vpand %%ymm5,%%ymm0,%%ymm1 \n" 3446 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3447 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" 3448 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3449 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3450 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3451 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3452 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) 3453 "lea " MEMLEA(0x10,1) ",%1 \n" 3454 "sub $0x20,%3 \n" 3455 "jg 1b \n" 3456 "vzeroupper \n" 3457 : "+r"(src_uyvy), // %0 3458 "+r"(dst_u), // %1 3459 "+r"(dst_v), // %2 3460 "+r"(width) // %3 3461 : 3462 : "memory", "cc", NACL_R14 3463 "xmm0", "xmm1", "xmm5" 3464 ); 3465 } 3466 #endif // HAS_YUY2TOYROW_AVX2 3467 3468 #ifdef HAS_ARGBBLENDROW_SSSE3 3469 // Shuffle table for isolating alpha. 3470 static uvec8 kShuffleAlpha = { 3471 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3472 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 3473 }; 3474 3475 // Blend 8 pixels at a time 3476 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 3477 uint8* dst_argb, int width) { 3478 asm volatile ( 3479 "pcmpeqb %%xmm7,%%xmm7 \n" 3480 "psrlw $0xf,%%xmm7 \n" 3481 "pcmpeqb %%xmm6,%%xmm6 \n" 3482 "psrlw $0x8,%%xmm6 \n" 3483 "pcmpeqb %%xmm5,%%xmm5 \n" 3484 "psllw $0x8,%%xmm5 \n" 3485 "pcmpeqb %%xmm4,%%xmm4 \n" 3486 "pslld $0x18,%%xmm4 \n" 3487 "sub $0x4,%3 \n" 3488 "jl 49f \n" 3489 3490 // 4 pixel loop. 
3491 LABELALIGN 3492 "40: \n" 3493 "movdqu " MEMACCESS(0) ",%%xmm3 \n" 3494 "lea " MEMLEA(0x10,0) ",%0 \n" 3495 "movdqa %%xmm3,%%xmm0 \n" 3496 "pxor %%xmm4,%%xmm3 \n" 3497 "movdqu " MEMACCESS(1) ",%%xmm2 \n" 3498 "pshufb %4,%%xmm3 \n" 3499 "pand %%xmm6,%%xmm2 \n" 3500 "paddw %%xmm7,%%xmm3 \n" 3501 "pmullw %%xmm3,%%xmm2 \n" 3502 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 3503 "lea " MEMLEA(0x10,1) ",%1 \n" 3504 "psrlw $0x8,%%xmm1 \n" 3505 "por %%xmm4,%%xmm0 \n" 3506 "pmullw %%xmm3,%%xmm1 \n" 3507 "psrlw $0x8,%%xmm2 \n" 3508 "paddusb %%xmm2,%%xmm0 \n" 3509 "pand %%xmm5,%%xmm1 \n" 3510 "paddusb %%xmm1,%%xmm0 \n" 3511 "movdqu %%xmm0," MEMACCESS(2) " \n" 3512 "lea " MEMLEA(0x10,2) ",%2 \n" 3513 "sub $0x4,%3 \n" 3514 "jge 40b \n" 3515 3516 "49: \n" 3517 "add $0x3,%3 \n" 3518 "jl 99f \n" 3519 3520 // 1 pixel loop. 3521 "91: \n" 3522 "movd " MEMACCESS(0) ",%%xmm3 \n" 3523 "lea " MEMLEA(0x4,0) ",%0 \n" 3524 "movdqa %%xmm3,%%xmm0 \n" 3525 "pxor %%xmm4,%%xmm3 \n" 3526 "movd " MEMACCESS(1) ",%%xmm2 \n" 3527 "pshufb %4,%%xmm3 \n" 3528 "pand %%xmm6,%%xmm2 \n" 3529 "paddw %%xmm7,%%xmm3 \n" 3530 "pmullw %%xmm3,%%xmm2 \n" 3531 "movd " MEMACCESS(1) ",%%xmm1 \n" 3532 "lea " MEMLEA(0x4,1) ",%1 \n" 3533 "psrlw $0x8,%%xmm1 \n" 3534 "por %%xmm4,%%xmm0 \n" 3535 "pmullw %%xmm3,%%xmm1 \n" 3536 "psrlw $0x8,%%xmm2 \n" 3537 "paddusb %%xmm2,%%xmm0 \n" 3538 "pand %%xmm5,%%xmm1 \n" 3539 "paddusb %%xmm1,%%xmm0 \n" 3540 "movd %%xmm0," MEMACCESS(2) " \n" 3541 "lea " MEMLEA(0x4,2) ",%2 \n" 3542 "sub $0x1,%3 \n" 3543 "jge 91b \n" 3544 "99: \n" 3545 : "+r"(src_argb0), // %0 3546 "+r"(src_argb1), // %1 3547 "+r"(dst_argb), // %2 3548 "+r"(width) // %3 3549 : "m"(kShuffleAlpha) // %4 3550 : "memory", "cc" 3551 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3552 ); 3553 } 3554 #endif // HAS_ARGBBLENDROW_SSSE3 3555 3556 #ifdef HAS_BLENDPLANEROW_SSSE3 3557 // Blend 8 pixels at a time. 
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
//
// Blends two 8-bit planes per-pixel using a third plane as alpha:
// dst = (src0*a + src1*(255-a) + round) >> 8.  Uses pmaddubsw on
// 0x80-biased (signed) values per the formula above.  All four pointers
// are rebased relative to the alpha pointer so only one index advances.
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  asm volatile (
  "pcmpeqb    %%xmm5,%%xmm5                    \n"
  "psllw      $0x8,%%xmm5                      \n"  // xmm5 = 0xff00 words
  "mov        $0x80808080,%%eax                \n"
  "movd       %%eax,%%xmm6                     \n"
  "pshufd     $0x0,%%xmm6,%%xmm6               \n"  // xmm6 = 0x80 bias bytes
  "mov        $0x807f807f,%%eax                \n"
  "movd       %%eax,%%xmm7                     \n"
  "pshufd     $0x0,%%xmm7,%%xmm7               \n"  // xmm7 = 32768+127 word bias
  "sub        %2,%0                            \n"  // rebase src0 on alpha
  "sub        %2,%1                            \n"  // rebase src1 on alpha
  "sub        %2,%3                            \n"  // rebase dst on alpha

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movq       (%2),%%xmm0                      \n"  // load 8 alpha bytes
  "punpcklbw  %%xmm0,%%xmm0                    \n"  // words of (a, a)
  "pxor       %%xmm5,%%xmm0                    \n"  // words of (a, 255-a)
  "movq       (%0,%2,1),%%xmm1                 \n"  // 8 src0 bytes
  "movq       (%1,%2,1),%%xmm2                 \n"  // 8 src1 bytes
  "punpcklbw  %%xmm2,%%xmm1                    \n"  // interleave src0/src1
  "psubb      %%xmm6,%%xmm1                    \n"  // bias to signed range
  "pmaddubsw  %%xmm1,%%xmm0                    \n"  // a*s0' + (255-a)*s1'
  "paddw      %%xmm7,%%xmm0                    \n"  // unbias and round
  "psrlw      $0x8,%%xmm0                      \n"
  "packuswb   %%xmm0,%%xmm0                    \n"
  "movq       %%xmm0,(%3,%2,1)                 \n"
  "lea        0x8(%2),%2                       \n"
  "sub        $0x8,%4                          \n"
  "jg         1b                               \n"
  : "+r"(src0),   // %0
    "+r"(src1),   // %1
    "+r"(alpha),  // %2
    "+r"(dst),    // %3
    "+rm"(width)  // %4
  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
//
// AVX2 version of BlendPlaneRow: 32 pixels per iteration, same biased
// pmaddubsw formulation as the SSSE3 version above.
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                        const uint8* alpha, uint8* dst, int width) {
  asm volatile (
  "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
  "vpsllw     $0x8,%%ymm5,%%ymm5               \n"  // ymm5 = 0xff00 words
  "mov        $0x80808080,%%eax                \n"
  "vmovd      %%eax,%%xmm6                     \n"
  "vbroadcastss %%xmm6,%%ymm6                  \n"  // ymm6 = 0x80 bias bytes
  "mov        $0x807f807f,%%eax                \n"
  "vmovd      %%eax,%%xmm7                     \n"
  "vbroadcastss %%xmm7,%%ymm7                  \n"  // ymm7 = 32768+127 word bias
  "sub        %2,%0                            \n"  // rebase src0 on alpha
  "sub        %2,%1                            \n"  // rebase src1 on alpha
  "sub        %2,%3                            \n"  // rebase dst on alpha

  // 32 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "vmovdqu    (%2),%%ymm0                      \n"  // 32 alpha bytes
  "vpunpckhbw %%ymm0,%%ymm0,%%ymm3             \n"
  "vpunpcklbw %%ymm0,%%ymm0,%%ymm0             \n"
  "vpxor      %%ymm5,%%ymm3,%%ymm3             \n"  // words of (a, 255-a)
  "vpxor      %%ymm5,%%ymm0,%%ymm0             \n"
  "vmovdqu    (%0,%2,1),%%ymm1                 \n"
  "vmovdqu    (%1,%2,1),%%ymm2                 \n"
  "vpunpckhbw %%ymm2,%%ymm1,%%ymm4             \n"  // interleave src0/src1
  "vpunpcklbw %%ymm2,%%ymm1,%%ymm1             \n"
  "vpsubb     %%ymm6,%%ymm4,%%ymm4             \n"  // bias to signed range
  "vpsubb     %%ymm6,%%ymm1,%%ymm1             \n"
  "vpmaddubsw %%ymm4,%%ymm3,%%ymm3             \n"  // a*s0' + (255-a)*s1'
  "vpmaddubsw %%ymm1,%%ymm0,%%ymm0             \n"
  "vpaddw     %%ymm7,%%ymm3,%%ymm3             \n"  // unbias and round
  "vpaddw     %%ymm7,%%ymm0,%%ymm0             \n"
  "vpsrlw     $0x8,%%ymm3,%%ymm3               \n"
  "vpsrlw     $0x8,%%ymm0,%%ymm0               \n"
  "vpackuswb  %%ymm3,%%ymm0,%%ymm0             \n"
  "vmovdqu    %%ymm0,(%3,%2,1)                 \n"
  "lea        0x20(%2),%2                      \n"
  "sub        $0x20,%4                         \n"
  "jg         1b                               \n"
  "vzeroupper                                  \n"
  : "+r"(src0),   // %0
    "+r"(src1),   // %1
    "+r"(alpha),  // %2
    "+r"(dst),    // %3
    "+rm"(width)  // %4
  :: "memory", "cc", "eax",
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time.
// Premultiplies B/G/R by alpha ((c * a) >> 8 via pmulhuw on byte-doubled
// words); the alpha channel itself is passed through unchanged.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
  "pcmpeqb    %%xmm3,%%xmm3                    \n"
  "pslld      $0x18,%%xmm3                     \n"  // xmm3 = 0xff000000 alpha mask
  "movdqa     %3,%%xmm4                        \n"
  "movdqa     %4,%%xmm5                        \n"

  // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "pshufb     %%xmm4,%%xmm0                    \n"  // low 2 pixels: alpha words
  "movdqu     " MEMACCESS(0) ",%%xmm1          \n"
  "punpcklbw  %%xmm1,%%xmm1                    \n"  // color bytes doubled to words
  "pmulhuw    %%xmm1,%%xmm0                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm1          \n"
  "pshufb     %%xmm5,%%xmm1                    \n"  // high 2 pixels: alpha words
  "movdqu     " MEMACCESS(0) ",%%xmm2          \n"
  "punpckhbw  %%xmm2,%%xmm2                    \n"
  "pmulhuw    %%xmm2,%%xmm1                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm2          \n"
  "lea        " MEMLEA(0x10,0) ",%0            \n"
  "pand       %%xmm3,%%xmm2                    \n"  // original alpha bytes
  "psrlw      $0x8,%%xmm0                      \n"
  "psrlw      $0x8,%%xmm1                      \n"
  "packuswb   %%xmm1,%%xmm0                    \n"
  "por        %%xmm2,%%xmm0                    \n"  // restore alpha channel
  "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  "lea        " MEMLEA(0x10,1) ",%1            \n"
  "sub        $0x4,%2                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_argb),        // %1
    "+r"(width)            // %2
  : "m"(kShuffleAlpha0),   // %3
    "m"(kShuffleAlpha1)    // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// Attenuate 8 pixels at a time.
// AVX2 premultiply; dst is addressed as src + (dst - src) so only one
// pointer advances.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
  "vbroadcastf128 %3,%%ymm4                    \n"  // alpha-duplicating shuffle
  "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
  "vpslld     $0x18,%%ymm5,%%ymm5              \n"  // ymm5 = 0xff000000 alpha mask
  "sub        %0,%1                            \n"  // %1 = dst - src

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "vmovdqu    " MEMACCESS(0) ",%%ymm6          \n"
  "vpunpcklbw %%ymm6,%%ymm6,%%ymm0             \n"  // color bytes doubled to words
  "vpunpckhbw %%ymm6,%%ymm6,%%ymm1             \n"
  "vpshufb    %%ymm4,%%ymm0,%%ymm2             \n"  // alpha words
  "vpshufb    %%ymm4,%%ymm1,%%ymm3             \n"
  "vpmulhuw   %%ymm2,%%ymm0,%%ymm0             \n"
  "vpmulhuw   %%ymm3,%%ymm1,%%ymm1             \n"
  "vpand      %%ymm5,%%ymm6,%%ymm6             \n"  // original alpha bytes
  "vpsrlw     $0x8,%%ymm0,%%ymm0               \n"
  "vpsrlw     $0x8,%%ymm1,%%ymm1               \n"
  "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
  "vpor       %%ymm6,%%ymm0,%%ymm0             \n"  // restore alpha channel
  MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
  "lea        " MEMLEA(0x20,0) ",%0            \n"
  "sub        $0x8,%2                          \n"
  "jg         1b                               \n"
  "vzeroupper                                  \n"
  : "+r"(src_argb),           // %0
    "+r"(dst_argb),           // %1
    "+r"(width)               // %2
  : "m"(kShuffleAlpha_AVX2)   // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Reverses premultiplied alpha using the fixed_invtbl8 reciprocal table,
// indexed by each pixel's alpha byte (loaded via movzb into %3).
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
  // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "movzb      " MEMACCESS2(0x03,0) ",%3        \n"  // alpha of pixel 0
  "punpcklbw  %%xmm0,%%xmm0                    \n"
  MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd    0x0(%4,%3,4),%%xmm2
  "movzb      " MEMACCESS2(0x07,0) ",%3        \n"  // alpha of pixel 1
  MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd    0x0(%4,%3,4),%%xmm3
  "pshuflw    $0x40,%%xmm2,%%xmm2              \n"  // broadcast 1/a to B,G,R words
  "pshuflw    $0x40,%%xmm3,%%xmm3              \n"
  "movlhps    %%xmm3,%%xmm2                    \n"
  "pmulhuw    %%xmm2,%%xmm0                    \n"  // c * (1/a) fixed point
  "movdqu     " MEMACCESS(0) ",%%xmm1          \n"
  "movzb      " MEMACCESS2(0x0b,0) ",%3        \n"  // alpha of pixel 2
  "punpckhbw  %%xmm1,%%xmm1                    \n"
  MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd    0x0(%4,%3,4),%%xmm2
  "movzb      " MEMACCESS2(0x0f,0) ",%3        \n"  // alpha of pixel 3
  MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd    0x0(%4,%3,4),%%xmm3
  "pshuflw    $0x40,%%xmm2,%%xmm2              \n"
  "pshuflw    $0x40,%%xmm3,%%xmm3              \n"
  "movlhps    %%xmm3,%%xmm2                    \n"
  "pmulhuw    %%xmm2,%%xmm1                    \n"
  "lea        " MEMLEA(0x10,0) ",%0            \n"
  "packuswb   %%xmm1,%%xmm0                    \n"
  "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  "lea        " MEMLEA(0x10,1) ",%1            \n"
  "sub        $0x4,%2                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "=&r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc", NACL_R14
  // NOTE(review): xmm4/xmm5 are clobber-listed but unused in this body.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// Unattenuate 8 pixels at a time.
// The 8 reciprocal-table lookups are done with scalar movzb/vmovd loads
// and assembled into one ymm register in place of VPGATHER (see the
// "replace VPGATHER" marker below).
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
  "sub        %0,%1                            \n"  // %1 = dst - src
  "vbroadcastf128 %5,%%ymm5                    \n"

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  // replace VPGATHER
  "movzb      " MEMACCESS2(0x03,0) ",%3        \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //  vmovd   0x0(%4,%3,4),%%xmm0
  "movzb      " MEMACCESS2(0x07,0) ",%3        \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //  vmovd   0x0(%4,%3,4),%%xmm1
  "movzb      " MEMACCESS2(0x0b,0) ",%3        \n"
  "vpunpckldq %%xmm1,%%xmm0,%%xmm6             \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //  vmovd   0x0(%4,%3,4),%%xmm2
  "movzb      " MEMACCESS2(0x0f,0) ",%3        \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //  vmovd   0x0(%4,%3,4),%%xmm3
  "movzb      " MEMACCESS2(0x13,0) ",%3        \n"
  "vpunpckldq %%xmm3,%%xmm2,%%xmm7             \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //  vmovd   0x0(%4,%3,4),%%xmm0
  "movzb      " MEMACCESS2(0x17,0) ",%3        \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //  vmovd   0x0(%4,%3,4),%%xmm1
  "movzb      " MEMACCESS2(0x1b,0) ",%3        \n"
  "vpunpckldq %%xmm1,%%xmm0,%%xmm0             \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //  vmovd   0x0(%4,%3,4),%%xmm2
  "movzb      " MEMACCESS2(0x1f,0) ",%3        \n"
  MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //  vmovd   0x0(%4,%3,4),%%xmm3
  "vpunpckldq %%xmm3,%%xmm2,%%xmm2             \n"
  "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3            \n"
  "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0            \n"
  "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3       \n"  // ymm3 = 8 reciprocals
  // end of VPGATHER

  "vmovdqu    " MEMACCESS(0) ",%%ymm6          \n"
  "vpunpcklbw %%ymm6,%%ymm6,%%ymm0             \n"  // color bytes doubled to words
  "vpunpckhbw %%ymm6,%%ymm6,%%ymm1             \n"
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm2             \n"
  "vpunpckhwd %%ymm3,%%ymm3,%%ymm3             \n"
  "vpshufb    %%ymm5,%%ymm2,%%ymm2             \n"  // spread 1/a across B,G,R
  "vpshufb    %%ymm5,%%ymm3,%%ymm3             \n"
  "vpmulhuw   %%ymm2,%%ymm0,%%ymm0             \n"
  "vpmulhuw   %%ymm3,%%ymm1,%%ymm1             \n"
  "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
  MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
  "lea        " MEMLEA(0x20,0) ",%0            \n"
  "sub        $0x8,%2                          \n"
  "jg         1b                               \n"
  "vzeroupper                                  \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "=&r"(alpha)        // %3
  : "r"(fixed_invtbl8),              // %4
    "m"(kUnattenShuffleAlpha_AVX2)   // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Luma is computed with the JPeg coefficients (kARGBToYJ) plus 0.5
// rounding (kAddYJ64), then replicated into B, G and R; the original
// alpha is preserved.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
  "movdqa     %3,%%xmm4                        \n"  // kARGBToYJ coefficients
  "movdqa     %4,%%xmm5                        \n"  // kAddYJ64 rounding

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  "pmaddubsw  %%xmm4,%%xmm0                    \n"
  "pmaddubsw  %%xmm4,%%xmm1                    \n"
  "phaddw     %%xmm1,%%xmm0                    \n"
  "paddw      %%xmm5,%%xmm0                    \n"  // round (7-bit fixed point)
  "psrlw      $0x7,%%xmm0                      \n"
  "packuswb   %%xmm0,%%xmm0                    \n"  // 8 gray bytes
  "movdqu     " MEMACCESS(0) ",%%xmm2          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm3    \n"
  "lea        " MEMLEA(0x20,0) ",%0            \n"
  "psrld      $0x18,%%xmm2                     \n"  // extract alphas
  "psrld      $0x18,%%xmm3                     \n"
  "packuswb   %%xmm3,%%xmm2                    \n"
  "packuswb   %%xmm2,%%xmm2                    \n"
  "movdqa     %%xmm0,%%xmm3                    \n"
  "punpcklbw  %%xmm0,%%xmm0                    \n"  // gray,gray pairs
  "punpcklbw  %%xmm2,%%xmm3                    \n"  // gray,alpha pairs
  "movdqa     %%xmm0,%%xmm1                    \n"
  "punpcklwd  %%xmm3,%%xmm0                    \n"  // g,g,g,a pixels
  "punpckhwd  %%xmm3,%%xmm1                    \n"
  "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  "movdqu     %%xmm1," MEMACCESS2(0x10,1) "    \n"
  "lea        " MEMLEA(0x20,1) ",%1            \n"
  "sub        $0x8,%2                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place: reads and writes dst_argb.  Each channel is a pmaddubsw dot
// product with the tables above; alpha is preserved.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
  "movdqa     %2,%%xmm2                        \n"  // sepia B coefficients
  "movdqa     %3,%%xmm3                        \n"  // sepia G coefficients
  "movdqa     %4,%%xmm4                        \n"  // sepia R coefficients

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm6    \n"
  "pmaddubsw  %%xmm2,%%xmm0                    \n"  // new B
  "pmaddubsw  %%xmm2,%%xmm6                    \n"
  "phaddw     %%xmm6,%%xmm0                    \n"
  "psrlw      $0x7,%%xmm0                      \n"
  "packuswb   %%xmm0,%%xmm0                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm5          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  "pmaddubsw  %%xmm3,%%xmm5                    \n"  // new G
  "pmaddubsw  %%xmm3,%%xmm1                    \n"
  "phaddw     %%xmm1,%%xmm5                    \n"
  "psrlw      $0x7,%%xmm5                      \n"
  "packuswb   %%xmm5,%%xmm5                    \n"
  "punpcklbw  %%xmm5,%%xmm0                    \n"  // B,G pairs
  "movdqu     " MEMACCESS(0) ",%%xmm5          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  "pmaddubsw  %%xmm4,%%xmm5                    \n"  // new R
  "pmaddubsw  %%xmm4,%%xmm1                    \n"
  "phaddw     %%xmm1,%%xmm5                    \n"
  "psrlw      $0x7,%%xmm5                      \n"
  "packuswb   %%xmm5,%%xmm5                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm6          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  "psrld      $0x18,%%xmm6                     \n"  // original alphas
  "psrld      $0x18,%%xmm1                     \n"
  "packuswb   %%xmm1,%%xmm6                    \n"
  "packuswb   %%xmm6,%%xmm6                    \n"
  "punpcklbw  %%xmm6,%%xmm5                    \n"  // R,A pairs
  "movdqa     %%xmm0,%%xmm1                    \n"
  "punpcklwd  %%xmm5,%%xmm0                    \n"  // B,G,R,A pixels
  "punpckhwd  %%xmm5,%%xmm1                    \n"
  "movdqu     %%xmm0," MEMACCESS(0) "          \n"
  "movdqu     %%xmm1," MEMACCESS2(0x10,0) "    \n"
  "lea        " MEMLEA(0x20,0) ",%0            \n"
  "sub        $0x8,%1                          \n"
  "jg         1b                               \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// The four rows of the 4x4 matrix are broadcast from matrix_argb via
// pshufd; output channels use phaddsw/psraw $6 (6-bit fixed point).
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
  "movdqu     " MEMACCESS(3) ",%%xmm5          \n"
  "pshufd     $0x00,%%xmm5,%%xmm2              \n"  // matrix row 0 (B)
  "pshufd     $0x55,%%xmm5,%%xmm3              \n"  // matrix row 1 (G)
  "pshufd     $0xaa,%%xmm5,%%xmm4              \n"  // matrix row 2 (R)
  "pshufd     $0xff,%%xmm5,%%xmm5              \n"  // matrix row 3 (A)

  // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm7    \n"
  "pmaddubsw  %%xmm2,%%xmm0                    \n"
  "pmaddubsw  %%xmm2,%%xmm7                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm6          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
  "pmaddubsw  %%xmm3,%%xmm6                    \n"
  "pmaddubsw  %%xmm3,%%xmm1                    \n"
  "phaddsw    %%xmm7,%%xmm0                    \n"  // B channel, saturating
  "phaddsw    %%xmm1,%%xmm6                    \n"  // G channel
  "psraw      $0x6,%%xmm0                      \n"
  "psraw      $0x6,%%xmm6                      \n"
  "packuswb   %%xmm0,%%xmm0                    \n"
  "packuswb   %%xmm6,%%xmm6                    \n"
  "punpcklbw  %%xmm6,%%xmm0                    \n"  // B,G pairs
  "movdqu     " MEMACCESS(0) ",%%xmm1          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm7    \n"
  "pmaddubsw  %%xmm4,%%xmm1                    \n"
  "pmaddubsw  %%xmm4,%%xmm7                    \n"
  "phaddsw    %%xmm7,%%xmm1                    \n"  // R channel
  "movdqu     " MEMACCESS(0) ",%%xmm6          \n"
  "movdqu     " MEMACCESS2(0x10,0) ",%%xmm7    \n"
  "pmaddubsw  %%xmm5,%%xmm6                    \n"
  "pmaddubsw  %%xmm5,%%xmm7                    \n"
  "phaddsw    %%xmm7,%%xmm6                    \n"  // A channel
  "psraw      $0x6,%%xmm1                      \n"
  "psraw      $0x6,%%xmm6                      \n"
  "packuswb   %%xmm1,%%xmm1                    \n"
  "packuswb   %%xmm6,%%xmm6                    \n"
  "punpcklbw  %%xmm6,%%xmm1                    \n"  // R,A pairs
  "movdqa     %%xmm0,%%xmm6                    \n"
  "punpcklwd  %%xmm1,%%xmm0                    \n"  // B,G,R,A pixels
  "punpckhwd  %%xmm1,%%xmm6                    \n"
  "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  "movdqu     %%xmm6," MEMACCESS2(0x10,1) "    \n"
  "lea        " MEMLEA(0x20,0) ",%0            \n"
  "lea        " MEMLEA(0x20,1) ",%1            \n"
  "sub        $0x8,%2                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "r"(matrix_argb)   // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In-place: c = (c * scale >> 16) * interval_size + interval_offset for
// B/G/R; alpha bytes are preserved via the 0xff000000 mask in xmm6.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
  "movd       %2,%%xmm2                        \n"
  "movd       %3,%%xmm3                        \n"
  "movd       %4,%%xmm4                        \n"
  "pshuflw    $0x40,%%xmm2,%%xmm2              \n"  // broadcast scale to words
  "pshufd     $0x44,%%xmm2,%%xmm2              \n"
  "pshuflw    $0x40,%%xmm3,%%xmm3              \n"  // broadcast interval_size
  "pshufd     $0x44,%%xmm3,%%xmm3              \n"
  "pshuflw    $0x40,%%xmm4,%%xmm4              \n"  // broadcast interval_offset
  "pshufd     $0x44,%%xmm4,%%xmm4              \n"
  "pxor       %%xmm5,%%xmm5                    \n"  // zero for unpack
  "pcmpeqb    %%xmm6,%%xmm6                    \n"
  "pslld      $0x18,%%xmm6                     \n"  // xmm6 = 0xff000000 alpha mask

  // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "punpcklbw  %%xmm5,%%xmm0                    \n"
  "pmulhuw    %%xmm2,%%xmm0                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm1          \n"
  "punpckhbw  %%xmm5,%%xmm1                    \n"
  "pmulhuw    %%xmm2,%%xmm1                    \n"
  "pmullw     %%xmm3,%%xmm0                    \n"
  "movdqu     " MEMACCESS(0) ",%%xmm7          \n"
  "pmullw     %%xmm3,%%xmm1                    \n"
  "pand       %%xmm6,%%xmm7                    \n"  // original alphas
  "paddw      %%xmm4,%%xmm0                    \n"
  "paddw      %%xmm4,%%xmm1                    \n"
  "packuswb   %%xmm1,%%xmm0                    \n"
  "por        %%xmm7,%%xmm0                    \n"  // restore alpha channel
  "movdqu     %%xmm0," MEMACCESS(0) "          \n"
  "lea        " MEMLEA(0x10,0) ",%0            \n"
  "sub        $0x4,%1                          \n"
  "jg         1b                               \n"
  : "+r"(dst_argb),        // %0
    "+r"(width)            // %1
  : "r"(scale),            // %2
    "r"(interval_size),    // %3
    "r"(interval_offset)   // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Each output byte = (src_byte * value_byte) >> 8, channel-wise, with the
// 4 bytes of value applied to the 4 channels of every pixel.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
  "movd       %3,%%xmm2                        \n"
  "punpcklbw  %%xmm2,%%xmm2                    \n"  // value bytes doubled to words
  "punpcklqdq %%xmm2,%%xmm2                    \n"  // replicate to all 4 pixels

  // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "lea        " MEMLEA(0x10,0) ",%0            \n"
  "movdqa     %%xmm0,%%xmm1                    \n"
  "punpcklbw  %%xmm0,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm1                    \n"
  "pmulhuw    %%xmm2,%%xmm0                    \n"
  "pmulhuw    %%xmm2,%%xmm1                    \n"
  "psrlw      $0x8,%%xmm0                      \n"
  "psrlw      $0x8,%%xmm1                      \n"
  "packuswb   %%xmm1,%%xmm0                    \n"
  "movdqu     %%xmm0," MEMACCESS(1) "          \n"
  "lea        " MEMLEA(0x10,1) ",%1            \n"
  "sub        $0x4,%2                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Channel-wise (a*b) >> 8 via pmulhuw on byte-doubled src0 words against
// zero-extended src1 words.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
  "pxor       %%xmm5,%%xmm5                    \n"  // zero for unpack

  // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
  "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
  "lea        " MEMLEA(0x10,0) ",%0            \n"
  "movdqu     " MEMACCESS(1) ",%%xmm2          \n"
  "lea        " MEMLEA(0x10,1) ",%1            \n"
  // NOTE(review): reg-to-reg movdqu (movdqa would be the usual choice);
  // functionally identical, kept as-is.
  "movdqu     %%xmm0,%%xmm1                    \n"
  "movdqu     %%xmm2,%%xmm3                    \n"
  "punpcklbw  %%xmm0,%%xmm0                    \n"  // src0 bytes doubled
  "punpckhbw  %%xmm1,%%xmm1                    \n"
  "punpcklbw  %%xmm5,%%xmm2                    \n"  // src1 bytes zero-extended
  "punpckhbw  %%xmm5,%%xmm3                    \n"
  "pmulhuw    %%xmm2,%%xmm0                    \n"
  "pmulhuw    %%xmm3,%%xmm1                    \n"
  "packuswb   %%xmm1,%%xmm0                    \n"
  "movdqu     %%xmm0," MEMACCESS(2) "          \n"
  "lea        " MEMLEA(0x10,2) ",%2            \n"
  "sub        $0x4,%3                          \n"
  "jg         1b                               \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
  "vpxor      %%ymm5,%%ymm5,%%ymm5             \n"

  // 4 pixel loop.
4191 LABELALIGN 4192 "1: \n" 4193 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 4194 "lea " MEMLEA(0x20,0) ",%0 \n" 4195 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" 4196 "lea " MEMLEA(0x20,1) ",%1 \n" 4197 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" 4198 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" 4199 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" 4200 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" 4201 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" 4202 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" 4203 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 4204 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4205 "lea " MEMLEA(0x20,2) ",%2 \n" 4206 "sub $0x8,%3 \n" 4207 "jg 1b \n" 4208 "vzeroupper \n" 4209 : "+r"(src_argb0), // %0 4210 "+r"(src_argb1), // %1 4211 "+r"(dst_argb), // %2 4212 "+r"(width) // %3 4213 : 4214 : "memory", "cc" 4215 #if defined(__AVX2__) 4216 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4217 #endif 4218 ); 4219 } 4220 #endif // HAS_ARGBMULTIPLYROW_AVX2 4221 4222 #ifdef HAS_ARGBADDROW_SSE2 4223 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4224 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4225 uint8* dst_argb, int width) { 4226 asm volatile ( 4227 // 4 pixel loop. 4228 LABELALIGN 4229 "1: \n" 4230 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4231 "lea " MEMLEA(0x10,0) ",%0 \n" 4232 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4233 "lea " MEMLEA(0x10,1) ",%1 \n" 4234 "paddusb %%xmm1,%%xmm0 \n" 4235 "movdqu %%xmm0," MEMACCESS(2) " \n" 4236 "lea " MEMLEA(0x10,2) ",%2 \n" 4237 "sub $0x4,%3 \n" 4238 "jg 1b \n" 4239 : "+r"(src_argb0), // %0 4240 "+r"(src_argb1), // %1 4241 "+r"(dst_argb), // %2 4242 "+r"(width) // %3 4243 : 4244 : "memory", "cc" 4245 , "xmm0", "xmm1" 4246 ); 4247 } 4248 #endif // HAS_ARGBADDROW_SSE2 4249 4250 #ifdef HAS_ARGBADDROW_AVX2 4251 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4252 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4253 uint8* dst_argb, int width) { 4254 asm volatile ( 4255 // 4 pixel loop. 
4256 LABELALIGN 4257 "1: \n" 4258 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4259 "lea " MEMLEA(0x20,0) ",%0 \n" 4260 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4261 "lea " MEMLEA(0x20,1) ",%1 \n" 4262 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4263 "lea " MEMLEA(0x20,2) ",%2 \n" 4264 "sub $0x8,%3 \n" 4265 "jg 1b \n" 4266 "vzeroupper \n" 4267 : "+r"(src_argb0), // %0 4268 "+r"(src_argb1), // %1 4269 "+r"(dst_argb), // %2 4270 "+r"(width) // %3 4271 : 4272 : "memory", "cc" 4273 , "xmm0" 4274 ); 4275 } 4276 #endif // HAS_ARGBADDROW_AVX2 4277 4278 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4279 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. 4280 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4281 uint8* dst_argb, int width) { 4282 asm volatile ( 4283 // 4 pixel loop. 4284 LABELALIGN 4285 "1: \n" 4286 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4287 "lea " MEMLEA(0x10,0) ",%0 \n" 4288 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4289 "lea " MEMLEA(0x10,1) ",%1 \n" 4290 "psubusb %%xmm1,%%xmm0 \n" 4291 "movdqu %%xmm0," MEMACCESS(2) " \n" 4292 "lea " MEMLEA(0x10,2) ",%2 \n" 4293 "sub $0x4,%3 \n" 4294 "jg 1b \n" 4295 : "+r"(src_argb0), // %0 4296 "+r"(src_argb1), // %1 4297 "+r"(dst_argb), // %2 4298 "+r"(width) // %3 4299 : 4300 : "memory", "cc" 4301 , "xmm0", "xmm1" 4302 ); 4303 } 4304 #endif // HAS_ARGBSUBTRACTROW_SSE2 4305 4306 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4307 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 4308 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4309 uint8* dst_argb, int width) { 4310 asm volatile ( 4311 // 4 pixel loop. 
4312 LABELALIGN 4313 "1: \n" 4314 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4315 "lea " MEMLEA(0x20,0) ",%0 \n" 4316 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4317 "lea " MEMLEA(0x20,1) ",%1 \n" 4318 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4319 "lea " MEMLEA(0x20,2) ",%2 \n" 4320 "sub $0x8,%3 \n" 4321 "jg 1b \n" 4322 "vzeroupper \n" 4323 : "+r"(src_argb0), // %0 4324 "+r"(src_argb1), // %1 4325 "+r"(dst_argb), // %2 4326 "+r"(width) // %3 4327 : 4328 : "memory", "cc" 4329 , "xmm0" 4330 ); 4331 } 4332 #endif // HAS_ARGBSUBTRACTROW_AVX2 4333 4334 #ifdef HAS_SOBELXROW_SSE2 4335 // SobelX as a matrix is 4336 // -1 0 1 4337 // -2 0 2 4338 // -1 0 1 4339 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4340 const uint8* src_y2, uint8* dst_sobelx, int width) { 4341 asm volatile ( 4342 "sub %0,%1 \n" 4343 "sub %0,%2 \n" 4344 "sub %0,%3 \n" 4345 "pxor %%xmm5,%%xmm5 \n" 4346 4347 // 8 pixel loop. 4348 LABELALIGN 4349 "1: \n" 4350 "movq " MEMACCESS(0) ",%%xmm0 \n" 4351 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" 4352 "punpcklbw %%xmm5,%%xmm0 \n" 4353 "punpcklbw %%xmm5,%%xmm1 \n" 4354 "psubw %%xmm1,%%xmm0 \n" 4355 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4356 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 4357 "punpcklbw %%xmm5,%%xmm1 \n" 4358 "punpcklbw %%xmm5,%%xmm2 \n" 4359 "psubw %%xmm2,%%xmm1 \n" 4360 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 4361 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 4362 "punpcklbw %%xmm5,%%xmm2 \n" 4363 "punpcklbw %%xmm5,%%xmm3 \n" 4364 "psubw %%xmm3,%%xmm2 \n" 4365 "paddw %%xmm2,%%xmm0 \n" 4366 "paddw %%xmm1,%%xmm0 \n" 4367 "paddw %%xmm1,%%xmm0 \n" 4368 "pxor %%xmm1,%%xmm1 \n" 4369 "psubw %%xmm0,%%xmm1 \n" 4370 "pmaxsw %%xmm1,%%xmm0 \n" 4371 "packuswb %%xmm0,%%xmm0 \n" 4372 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) 4373 "lea " MEMLEA(0x8,0) ",%0 \n" 4374 "sub $0x8,%4 \n" 4375 "jg 1b \n" 4376 : "+r"(src_y0), // %0 4377 "+r"(src_y1), // %1 4378 "+r"(src_y2), // %2 4379 
"+r"(dst_sobelx), // %3 4380 "+r"(width) // %4 4381 : 4382 : "memory", "cc", NACL_R14 4383 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4384 ); 4385 } 4386 #endif // HAS_SOBELXROW_SSE2 4387 4388 #ifdef HAS_SOBELYROW_SSE2 4389 // SobelY as a matrix is 4390 // -1 -2 -1 4391 // 0 0 0 4392 // 1 2 1 4393 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4394 uint8* dst_sobely, int width) { 4395 asm volatile ( 4396 "sub %0,%1 \n" 4397 "sub %0,%2 \n" 4398 "pxor %%xmm5,%%xmm5 \n" 4399 4400 // 8 pixel loop. 4401 LABELALIGN 4402 "1: \n" 4403 "movq " MEMACCESS(0) ",%%xmm0 \n" 4404 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4405 "punpcklbw %%xmm5,%%xmm0 \n" 4406 "punpcklbw %%xmm5,%%xmm1 \n" 4407 "psubw %%xmm1,%%xmm0 \n" 4408 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" 4409 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 4410 "punpcklbw %%xmm5,%%xmm1 \n" 4411 "punpcklbw %%xmm5,%%xmm2 \n" 4412 "psubw %%xmm2,%%xmm1 \n" 4413 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" 4414 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 4415 "punpcklbw %%xmm5,%%xmm2 \n" 4416 "punpcklbw %%xmm5,%%xmm3 \n" 4417 "psubw %%xmm3,%%xmm2 \n" 4418 "paddw %%xmm2,%%xmm0 \n" 4419 "paddw %%xmm1,%%xmm0 \n" 4420 "paddw %%xmm1,%%xmm0 \n" 4421 "pxor %%xmm1,%%xmm1 \n" 4422 "psubw %%xmm0,%%xmm1 \n" 4423 "pmaxsw %%xmm1,%%xmm0 \n" 4424 "packuswb %%xmm0,%%xmm0 \n" 4425 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) 4426 "lea " MEMLEA(0x8,0) ",%0 \n" 4427 "sub $0x8,%3 \n" 4428 "jg 1b \n" 4429 : "+r"(src_y0), // %0 4430 "+r"(src_y1), // %1 4431 "+r"(dst_sobely), // %2 4432 "+r"(width) // %3 4433 : 4434 : "memory", "cc", NACL_R14 4435 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4436 ); 4437 } 4438 #endif // HAS_SOBELYROW_SSE2 4439 4440 #ifdef HAS_SOBELROW_SSE2 4441 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
4442 // A = 255 4443 // R = Sobel 4444 // G = Sobel 4445 // B = Sobel 4446 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4447 uint8* dst_argb, int width) { 4448 asm volatile ( 4449 "sub %0,%1 \n" 4450 "pcmpeqb %%xmm5,%%xmm5 \n" 4451 "pslld $0x18,%%xmm5 \n" 4452 4453 // 8 pixel loop. 4454 LABELALIGN 4455 "1: \n" 4456 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4457 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4458 "lea " MEMLEA(0x10,0) ",%0 \n" 4459 "paddusb %%xmm1,%%xmm0 \n" 4460 "movdqa %%xmm0,%%xmm2 \n" 4461 "punpcklbw %%xmm0,%%xmm2 \n" 4462 "punpckhbw %%xmm0,%%xmm0 \n" 4463 "movdqa %%xmm2,%%xmm1 \n" 4464 "punpcklwd %%xmm2,%%xmm1 \n" 4465 "punpckhwd %%xmm2,%%xmm2 \n" 4466 "por %%xmm5,%%xmm1 \n" 4467 "por %%xmm5,%%xmm2 \n" 4468 "movdqa %%xmm0,%%xmm3 \n" 4469 "punpcklwd %%xmm0,%%xmm3 \n" 4470 "punpckhwd %%xmm0,%%xmm0 \n" 4471 "por %%xmm5,%%xmm3 \n" 4472 "por %%xmm5,%%xmm0 \n" 4473 "movdqu %%xmm1," MEMACCESS(2) " \n" 4474 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" 4475 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" 4476 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" 4477 "lea " MEMLEA(0x40,2) ",%2 \n" 4478 "sub $0x10,%3 \n" 4479 "jg 1b \n" 4480 : "+r"(src_sobelx), // %0 4481 "+r"(src_sobely), // %1 4482 "+r"(dst_argb), // %2 4483 "+r"(width) // %3 4484 : 4485 : "memory", "cc", NACL_R14 4486 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4487 ); 4488 } 4489 #endif // HAS_SOBELROW_SSE2 4490 4491 #ifdef HAS_SOBELTOPLANEROW_SSE2 4492 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 4493 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4494 uint8* dst_y, int width) { 4495 asm volatile ( 4496 "sub %0,%1 \n" 4497 "pcmpeqb %%xmm5,%%xmm5 \n" 4498 "pslld $0x18,%%xmm5 \n" 4499 4500 // 8 pixel loop. 
4501 LABELALIGN 4502 "1: \n" 4503 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4504 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4505 "lea " MEMLEA(0x10,0) ",%0 \n" 4506 "paddusb %%xmm1,%%xmm0 \n" 4507 "movdqu %%xmm0," MEMACCESS(2) " \n" 4508 "lea " MEMLEA(0x10,2) ",%2 \n" 4509 "sub $0x10,%3 \n" 4510 "jg 1b \n" 4511 : "+r"(src_sobelx), // %0 4512 "+r"(src_sobely), // %1 4513 "+r"(dst_y), // %2 4514 "+r"(width) // %3 4515 : 4516 : "memory", "cc", NACL_R14 4517 "xmm0", "xmm1" 4518 ); 4519 } 4520 #endif // HAS_SOBELTOPLANEROW_SSE2 4521 4522 #ifdef HAS_SOBELXYROW_SSE2 4523 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 4524 // A = 255 4525 // R = Sobel X 4526 // G = Sobel 4527 // B = Sobel Y 4528 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4529 uint8* dst_argb, int width) { 4530 asm volatile ( 4531 "sub %0,%1 \n" 4532 "pcmpeqb %%xmm5,%%xmm5 \n" 4533 4534 // 8 pixel loop. 4535 LABELALIGN 4536 "1: \n" 4537 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4538 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4539 "lea " MEMLEA(0x10,0) ",%0 \n" 4540 "movdqa %%xmm0,%%xmm2 \n" 4541 "paddusb %%xmm1,%%xmm2 \n" 4542 "movdqa %%xmm0,%%xmm3 \n" 4543 "punpcklbw %%xmm5,%%xmm3 \n" 4544 "punpckhbw %%xmm5,%%xmm0 \n" 4545 "movdqa %%xmm1,%%xmm4 \n" 4546 "punpcklbw %%xmm2,%%xmm4 \n" 4547 "punpckhbw %%xmm2,%%xmm1 \n" 4548 "movdqa %%xmm4,%%xmm6 \n" 4549 "punpcklwd %%xmm3,%%xmm6 \n" 4550 "punpckhwd %%xmm3,%%xmm4 \n" 4551 "movdqa %%xmm1,%%xmm7 \n" 4552 "punpcklwd %%xmm0,%%xmm7 \n" 4553 "punpckhwd %%xmm0,%%xmm1 \n" 4554 "movdqu %%xmm6," MEMACCESS(2) " \n" 4555 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" 4556 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" 4557 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" 4558 "lea " MEMLEA(0x40,2) ",%2 \n" 4559 "sub $0x10,%3 \n" 4560 "jg 1b \n" 4561 : "+r"(src_sobelx), // %0 4562 "+r"(src_sobely), // %1 4563 "+r"(dst_argb), // %2 4564 "+r"(width) // %3 4565 : 4566 : "memory", "cc", NACL_R14 4567 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5", "xmm6", "xmm7" 4568 ); 4569 } 4570 #endif // HAS_SOBELXYROW_SSE2 4571 4572 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 4573 // Creates a table of cumulative sums where each value is a sum of all values 4574 // above and to the left of the value, inclusive of the value. 4575 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 4576 const int32* previous_cumsum, int width) { 4577 asm volatile ( 4578 "pxor %%xmm0,%%xmm0 \n" 4579 "pxor %%xmm1,%%xmm1 \n" 4580 "sub $0x4,%3 \n" 4581 "jl 49f \n" 4582 "test $0xf,%1 \n" 4583 "jne 49f \n" 4584 4585 // 4 pixel loop \n" 4586 LABELALIGN 4587 "40: \n" 4588 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 4589 "lea " MEMLEA(0x10,0) ",%0 \n" 4590 "movdqa %%xmm2,%%xmm4 \n" 4591 "punpcklbw %%xmm1,%%xmm2 \n" 4592 "movdqa %%xmm2,%%xmm3 \n" 4593 "punpcklwd %%xmm1,%%xmm2 \n" 4594 "punpckhwd %%xmm1,%%xmm3 \n" 4595 "punpckhbw %%xmm1,%%xmm4 \n" 4596 "movdqa %%xmm4,%%xmm5 \n" 4597 "punpcklwd %%xmm1,%%xmm4 \n" 4598 "punpckhwd %%xmm1,%%xmm5 \n" 4599 "paddd %%xmm2,%%xmm0 \n" 4600 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 4601 "paddd %%xmm0,%%xmm2 \n" 4602 "paddd %%xmm3,%%xmm0 \n" 4603 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" 4604 "paddd %%xmm0,%%xmm3 \n" 4605 "paddd %%xmm4,%%xmm0 \n" 4606 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" 4607 "paddd %%xmm0,%%xmm4 \n" 4608 "paddd %%xmm5,%%xmm0 \n" 4609 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" 4610 "lea " MEMLEA(0x40,2) ",%2 \n" 4611 "paddd %%xmm0,%%xmm5 \n" 4612 "movdqu %%xmm2," MEMACCESS(1) " \n" 4613 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 4614 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" 4615 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" 4616 "lea " MEMLEA(0x40,1) ",%1 \n" 4617 "sub $0x4,%3 \n" 4618 "jge 40b \n" 4619 4620 "49: \n" 4621 "add $0x3,%3 \n" 4622 "jl 19f \n" 4623 4624 // 1 pixel loop \n" 4625 LABELALIGN 4626 "10: \n" 4627 "movd " MEMACCESS(0) ",%%xmm2 \n" 4628 "lea " MEMLEA(0x4,0) ",%0 \n" 4629 "punpcklbw %%xmm1,%%xmm2 \n" 4630 "punpcklwd %%xmm1,%%xmm2 \n" 4631 "paddd %%xmm2,%%xmm0 \n" 4632 
"movdqu " MEMACCESS(2) ",%%xmm2 \n" 4633 "lea " MEMLEA(0x10,2) ",%2 \n" 4634 "paddd %%xmm0,%%xmm2 \n" 4635 "movdqu %%xmm2," MEMACCESS(1) " \n" 4636 "lea " MEMLEA(0x10,1) ",%1 \n" 4637 "sub $0x1,%3 \n" 4638 "jge 10b \n" 4639 4640 "19: \n" 4641 : "+r"(row), // %0 4642 "+r"(cumsum), // %1 4643 "+r"(previous_cumsum), // %2 4644 "+r"(width) // %3 4645 : 4646 : "memory", "cc" 4647 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4648 ); 4649 } 4650 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 4651 4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 4654 int width, int area, uint8* dst, 4655 int count) { 4656 asm volatile ( 4657 "movd %5,%%xmm5 \n" 4658 "cvtdq2ps %%xmm5,%%xmm5 \n" 4659 "rcpss %%xmm5,%%xmm4 \n" 4660 "pshufd $0x0,%%xmm4,%%xmm4 \n" 4661 "sub $0x4,%3 \n" 4662 "jl 49f \n" 4663 "cmpl $0x80,%5 \n" 4664 "ja 40f \n" 4665 4666 "pshufd $0x0,%%xmm5,%%xmm5 \n" 4667 "pcmpeqb %%xmm6,%%xmm6 \n" 4668 "psrld $0x10,%%xmm6 \n" 4669 "cvtdq2ps %%xmm6,%%xmm6 \n" 4670 "addps %%xmm6,%%xmm5 \n" 4671 "mulps %%xmm4,%%xmm5 \n" 4672 "cvtps2dq %%xmm5,%%xmm5 \n" 4673 "packssdw %%xmm5,%%xmm5 \n" 4674 4675 // 4 pixel small loop \n" 4676 LABELALIGN 4677 "4: \n" 4678 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4679 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 4680 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 4681 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 4682 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4683 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 4684 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 4685 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 4686 "lea " MEMLEA(0x40,0) ",%0 \n" 4687 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4688 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 4689 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 4690 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 4691 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4692 
MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 4693 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 4694 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 4695 "lea " MEMLEA(0x40,1) ",%1 \n" 4696 "packssdw %%xmm1,%%xmm0 \n" 4697 "packssdw %%xmm3,%%xmm2 \n" 4698 "pmulhuw %%xmm5,%%xmm0 \n" 4699 "pmulhuw %%xmm5,%%xmm2 \n" 4700 "packuswb %%xmm2,%%xmm0 \n" 4701 "movdqu %%xmm0," MEMACCESS(2) " \n" 4702 "lea " MEMLEA(0x10,2) ",%2 \n" 4703 "sub $0x4,%3 \n" 4704 "jge 4b \n" 4705 "jmp 49f \n" 4706 4707 // 4 pixel loop \n" 4708 LABELALIGN 4709 "40: \n" 4710 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4711 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 4712 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 4713 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 4714 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4715 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 4716 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 4717 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 4718 "lea " MEMLEA(0x40,0) ",%0 \n" 4719 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4720 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 4721 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 4722 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 4723 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4724 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 4725 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 4726 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 4727 "lea " MEMLEA(0x40,1) ",%1 \n" 4728 "cvtdq2ps %%xmm0,%%xmm0 \n" 4729 "cvtdq2ps %%xmm1,%%xmm1 \n" 4730 "mulps %%xmm4,%%xmm0 \n" 4731 "mulps %%xmm4,%%xmm1 \n" 4732 "cvtdq2ps %%xmm2,%%xmm2 \n" 4733 "cvtdq2ps %%xmm3,%%xmm3 \n" 4734 "mulps %%xmm4,%%xmm2 \n" 4735 "mulps %%xmm4,%%xmm3 \n" 4736 "cvtps2dq %%xmm0,%%xmm0 \n" 4737 "cvtps2dq %%xmm1,%%xmm1 \n" 4738 "cvtps2dq %%xmm2,%%xmm2 \n" 4739 "cvtps2dq %%xmm3,%%xmm3 \n" 4740 "packssdw %%xmm1,%%xmm0 \n" 4741 "packssdw 
%%xmm3,%%xmm2 \n" 4742 "packuswb %%xmm2,%%xmm0 \n" 4743 "movdqu %%xmm0," MEMACCESS(2) " \n" 4744 "lea " MEMLEA(0x10,2) ",%2 \n" 4745 "sub $0x4,%3 \n" 4746 "jge 40b \n" 4747 4748 "49: \n" 4749 "add $0x3,%3 \n" 4750 "jl 19f \n" 4751 4752 // 1 pixel loop \n" 4753 LABELALIGN 4754 "10: \n" 4755 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4756 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4757 "lea " MEMLEA(0x10,0) ",%0 \n" 4758 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4759 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4760 "lea " MEMLEA(0x10,1) ",%1 \n" 4761 "cvtdq2ps %%xmm0,%%xmm0 \n" 4762 "mulps %%xmm4,%%xmm0 \n" 4763 "cvtps2dq %%xmm0,%%xmm0 \n" 4764 "packssdw %%xmm0,%%xmm0 \n" 4765 "packuswb %%xmm0,%%xmm0 \n" 4766 "movd %%xmm0," MEMACCESS(2) " \n" 4767 "lea " MEMLEA(0x4,2) ",%2 \n" 4768 "sub $0x1,%3 \n" 4769 "jge 10b \n" 4770 "19: \n" 4771 : "+r"(topleft), // %0 4772 "+r"(botleft), // %1 4773 "+r"(dst), // %2 4774 "+rm"(count) // %3 4775 : "r"((intptr_t)(width)), // %4 4776 "rm"(area) // %5 4777 : "memory", "cc", NACL_R14 4778 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 4779 ); 4780 } 4781 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4782 4783 #ifdef HAS_ARGBAFFINEROW_SSE2 4784 // Copy ARGB pixels from source image with slope to a row of destination. 
4785 LIBYUV_API 4786 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 4787 uint8* dst_argb, const float* src_dudv, int width) { 4788 intptr_t src_argb_stride_temp = src_argb_stride; 4789 intptr_t temp; 4790 asm volatile ( 4791 "movq " MEMACCESS(3) ",%%xmm2 \n" 4792 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" 4793 "shl $0x10,%1 \n" 4794 "add $0x4,%1 \n" 4795 "movd %1,%%xmm5 \n" 4796 "sub $0x4,%4 \n" 4797 "jl 49f \n" 4798 4799 "pshufd $0x44,%%xmm7,%%xmm7 \n" 4800 "pshufd $0x0,%%xmm5,%%xmm5 \n" 4801 "movdqa %%xmm2,%%xmm0 \n" 4802 "addps %%xmm7,%%xmm0 \n" 4803 "movlhps %%xmm0,%%xmm2 \n" 4804 "movdqa %%xmm7,%%xmm4 \n" 4805 "addps %%xmm4,%%xmm4 \n" 4806 "movdqa %%xmm2,%%xmm3 \n" 4807 "addps %%xmm4,%%xmm3 \n" 4808 "addps %%xmm4,%%xmm4 \n" 4809 4810 // 4 pixel loop \n" 4811 LABELALIGN 4812 "40: \n" 4813 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 4814 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 4815 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts 4816 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride 4817 "movd %%xmm0,%k1 \n" 4818 "pshufd $0x39,%%xmm0,%%xmm0 \n" 4819 "movd %%xmm0,%k5 \n" 4820 "pshufd $0x39,%%xmm0,%%xmm0 \n" 4821 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 4822 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 4823 "punpckldq %%xmm6,%%xmm1 \n" 4824 "addps %%xmm4,%%xmm2 \n" 4825 "movq %%xmm1," MEMACCESS(2) " \n" 4826 "movd %%xmm0,%k1 \n" 4827 "pshufd $0x39,%%xmm0,%%xmm0 \n" 4828 "movd %%xmm0,%k5 \n" 4829 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 4830 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 4831 "punpckldq %%xmm6,%%xmm0 \n" 4832 "addps %%xmm4,%%xmm3 \n" 4833 "movq %%xmm0," MEMACCESS2(0x08,2) " \n" 4834 "lea " MEMLEA(0x10,2) ",%2 \n" 4835 "sub $0x4,%4 \n" 4836 "jge 40b \n" 4837 4838 "49: \n" 4839 "add $0x3,%4 \n" 4840 "jl 19f \n" 4841 4842 // 1 pixel loop \n" 4843 LABELALIGN 4844 "10: \n" 4845 "cvttps2dq %%xmm2,%%xmm0 \n" 4846 "packssdw %%xmm0,%%xmm0 \n" 4847 
"pmaddwd %%xmm5,%%xmm0 \n" 4848 "addps %%xmm7,%%xmm2 \n" 4849 "movd %%xmm0,%k1 \n" 4850 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 4851 "movd %%xmm0," MEMACCESS(2) " \n" 4852 "lea " MEMLEA(0x04,2) ",%2 \n" 4853 "sub $0x1,%4 \n" 4854 "jge 10b \n" 4855 "19: \n" 4856 : "+r"(src_argb), // %0 4857 "+r"(src_argb_stride_temp), // %1 4858 "+r"(dst_argb), // %2 4859 "+r"(src_dudv), // %3 4860 "+rm"(width), // %4 4861 "=&r"(temp) // %5 4862 : 4863 : "memory", "cc", NACL_R14 4864 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4865 ); 4866 } 4867 #endif // HAS_ARGBAFFINEROW_SSE2 4868 4869 #ifdef HAS_INTERPOLATEROW_SSSE3 4870 // Bilinear filter 16x2 -> 16x1 4871 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 4872 ptrdiff_t src_stride, int dst_width, 4873 int source_y_fraction) { 4874 asm volatile ( 4875 "sub %1,%0 \n" 4876 "cmp $0x0,%3 \n" 4877 "je 100f \n" 4878 "cmp $0x80,%3 \n" 4879 "je 50f \n" 4880 4881 "movd %3,%%xmm0 \n" 4882 "neg %3 \n" 4883 "add $0x100,%3 \n" 4884 "movd %3,%%xmm5 \n" 4885 "punpcklbw %%xmm0,%%xmm5 \n" 4886 "punpcklwd %%xmm5,%%xmm5 \n" 4887 "pshufd $0x0,%%xmm5,%%xmm5 \n" 4888 "mov $0x80808080,%%eax \n" 4889 "movd %%eax,%%xmm4 \n" 4890 "pshufd $0x0,%%xmm4,%%xmm4 \n" 4891 4892 // General purpose row blend. 4893 LABELALIGN 4894 "1: \n" 4895 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 4896 MEMOPREG(movdqu,0x00,1,4,1,xmm2) 4897 "movdqa %%xmm0,%%xmm1 \n" 4898 "punpcklbw %%xmm2,%%xmm0 \n" 4899 "punpckhbw %%xmm2,%%xmm1 \n" 4900 "psubb %%xmm4,%%xmm0 \n" 4901 "psubb %%xmm4,%%xmm1 \n" 4902 "movdqa %%xmm5,%%xmm2 \n" 4903 "movdqa %%xmm5,%%xmm3 \n" 4904 "pmaddubsw %%xmm0,%%xmm2 \n" 4905 "pmaddubsw %%xmm1,%%xmm3 \n" 4906 "paddw %%xmm4,%%xmm2 \n" 4907 "paddw %%xmm4,%%xmm3 \n" 4908 "psrlw $0x8,%%xmm2 \n" 4909 "psrlw $0x8,%%xmm3 \n" 4910 "packuswb %%xmm3,%%xmm2 \n" 4911 MEMOPMEM(movdqu,xmm2,0x00,1,0,1) 4912 "lea " MEMLEA(0x10,1) ",%1 \n" 4913 "sub $0x10,%2 \n" 4914 "jg 1b \n" 4915 "jmp 99f \n" 4916 4917 // Blend 50 / 50. 
4918 LABELALIGN 4919 "50: \n" 4920 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 4921 MEMOPREG(movdqu,0x00,1,4,1,xmm1) 4922 "pavgb %%xmm1,%%xmm0 \n" 4923 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 4924 "lea " MEMLEA(0x10,1) ",%1 \n" 4925 "sub $0x10,%2 \n" 4926 "jg 50b \n" 4927 "jmp 99f \n" 4928 4929 // Blend 100 / 0 - Copy row unchanged. 4930 LABELALIGN 4931 "100: \n" 4932 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 4933 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 4934 "lea " MEMLEA(0x10,1) ",%1 \n" 4935 "sub $0x10,%2 \n" 4936 "jg 100b \n" 4937 4938 "99: \n" 4939 : "+r"(dst_ptr), // %0 4940 "+r"(src_ptr), // %1 4941 "+rm"(dst_width), // %2 4942 "+r"(source_y_fraction) // %3 4943 : "r"((intptr_t)(src_stride)) // %4 4944 : "memory", "cc", "eax", NACL_R14 4945 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4946 ); 4947 } 4948 #endif // HAS_INTERPOLATEROW_SSSE3 4949 4950 #ifdef HAS_INTERPOLATEROW_AVX2 4951 // Bilinear filter 32x2 -> 32x1 4952 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 4953 ptrdiff_t src_stride, int dst_width, 4954 int source_y_fraction) { 4955 asm volatile ( 4956 "cmp $0x0,%3 \n" 4957 "je 100f \n" 4958 "sub %1,%0 \n" 4959 "cmp $0x80,%3 \n" 4960 "je 50f \n" 4961 4962 "vmovd %3,%%xmm0 \n" 4963 "neg %3 \n" 4964 "add $0x100,%3 \n" 4965 "vmovd %3,%%xmm5 \n" 4966 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" 4967 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" 4968 "vbroadcastss %%xmm5,%%ymm5 \n" 4969 "mov $0x80808080,%%eax \n" 4970 "vmovd %%eax,%%xmm4 \n" 4971 "vbroadcastss %%xmm4,%%ymm4 \n" 4972 4973 // General purpose row blend. 
4974 LABELALIGN 4975 "1: \n" 4976 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" 4977 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) 4978 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" 4979 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" 4980 "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" 4981 "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" 4982 "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" 4983 "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" 4984 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" 4985 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" 4986 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" 4987 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 4988 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 4989 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) 4990 "lea " MEMLEA(0x20,1) ",%1 \n" 4991 "sub $0x20,%2 \n" 4992 "jg 1b \n" 4993 "jmp 99f \n" 4994 4995 // Blend 50 / 50. 4996 LABELALIGN 4997 "50: \n" 4998 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" 4999 VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 5000 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) 5001 "lea " MEMLEA(0x20,1) ",%1 \n" 5002 "sub $0x20,%2 \n" 5003 "jg 50b \n" 5004 "jmp 99f \n" 5005 5006 // Blend 100 / 0 - Copy row unchanged. 5007 LABELALIGN 5008 "100: \n" 5009 "rep movsb " MEMMOVESTRING(1,0) " \n" 5010 "jmp 999f \n" 5011 5012 "99: \n" 5013 "vzeroupper \n" 5014 "999: \n" 5015 : "+D"(dst_ptr), // %0 5016 "+S"(src_ptr), // %1 5017 "+cm"(dst_width), // %2 5018 "+r"(source_y_fraction) // %3 5019 : "r"((intptr_t)(src_stride)) // %4 5020 : "memory", "cc", "eax", NACL_R14 5021 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" 5022 ); 5023 } 5024 #endif // HAS_INTERPOLATEROW_AVX2 5025 5026 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 5027 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
5028 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5029 const uint8* shuffler, int width) { 5030 asm volatile ( 5031 "movdqu " MEMACCESS(3) ",%%xmm5 \n" 5032 LABELALIGN 5033 "1: \n" 5034 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5035 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5036 "lea " MEMLEA(0x20,0) ",%0 \n" 5037 "pshufb %%xmm5,%%xmm0 \n" 5038 "pshufb %%xmm5,%%xmm1 \n" 5039 "movdqu %%xmm0," MEMACCESS(1) " \n" 5040 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 5041 "lea " MEMLEA(0x20,1) ",%1 \n" 5042 "sub $0x8,%2 \n" 5043 "jg 1b \n" 5044 : "+r"(src_argb), // %0 5045 "+r"(dst_argb), // %1 5046 "+r"(width) // %2 5047 : "r"(shuffler) // %3 5048 : "memory", "cc" 5049 , "xmm0", "xmm1", "xmm5" 5050 ); 5051 } 5052 #endif // HAS_ARGBSHUFFLEROW_SSSE3 5053 5054 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5055 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5056 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5057 const uint8* shuffler, int width) { 5058 asm volatile ( 5059 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" 5060 LABELALIGN 5061 "1: \n" 5062 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 5063 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 5064 "lea " MEMLEA(0x40,0) ",%0 \n" 5065 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" 5066 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 5067 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 5068 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" 5069 "lea " MEMLEA(0x40,1) ",%1 \n" 5070 "sub $0x10,%2 \n" 5071 "jg 1b \n" 5072 "vzeroupper \n" 5073 : "+r"(src_argb), // %0 5074 "+r"(dst_argb), // %1 5075 "+r"(width) // %2 5076 : "r"(shuffler) // %3 5077 : "memory", "cc" 5078 , "xmm0", "xmm1", "xmm5" 5079 ); 5080 } 5081 #endif // HAS_ARGBSHUFFLEROW_AVX2 5082 5083 #ifdef HAS_ARGBSHUFFLEROW_SSE2 5084 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
// Shuffle the channel order of ARGB pixels, e.g. BGRA to ARGB.
// 'shuffler' is a 16 byte table of destination byte indices; only the first
// 4 entries are used here (the pattern repeats every 4 bytes).  The first
// 4 shuffler bytes are loaded as one 32 bit value and compared against four
// precomputed orders, each served by a fast pshuflw/pshufhw path doing
// 4 pixels per iteration; any other order falls back to a scalar
// table-driven gather doing 1 pixel per iteration.
// NOTE(review): the SIMD paths consume 4 pixels per iteration — width is
// presumably a multiple of 4 when those paths are taken; confirm callers.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  uintptr_t pixel_temp;  // scratch integer register for the scalar path.
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero, for byte->word unpack.
    "mov       " MEMACCESS(4) ",%k2            \n"  // first 4 shuffler bytes as u32.
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic fallback: gather each of the 4 bytes via the shuffler table.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  // movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  // movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  // movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  // movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Shuffler 0,1,2,3: word-reverse within each pixel (imm 0x1b = 0,1,2,3).
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words so
    "punpckhbw %%xmm5,%%xmm1                   \n"  // pshuflw/hw can permute them.
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // repack words to bytes.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    // Shuffler 1,2,3,0: rotate channels (imm 0x39 = 1,2,3,0).
    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    // Shuffler 3,0,1,2: rotate channels the other way (imm 0x93 = 3,0,1,2).
    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    // Shuffler 2,1,0,3: swap channels 0 and 2 (imm 0xc6 = 2,1,0,3).
    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2  must be d so %b2 (byte subregister) exists.
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14  // NACL_R14 adds "r14" under Native Client builds.
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2

#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleave planar I422 Y, U and V rows into packed YUY2 (Y0 U0 Y1 V0 ...).
// Processes 16 Y pixels (8 U/V pairs, 32 output bytes) per iteration.
// 'src_v' is addressed as src_u + (src_v - src_u) so one pointer increment
// serves both chroma planes.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    "sub       %1,%2                           \n"  // %2 = src_v - src_u offset.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(1) ",%%xmm2         \n"  // 8 U bytes.
    MEMOPREG(movq,0x00,1,2,1,xmm3)                  //  movq    (%1,%2,1),%%xmm3  (8 V bytes)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "punpcklbw %%xmm3,%%xmm2                   \n"  // xmm2 = U0 V0 U1 V1 ...
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 Y bytes.
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // Y0 U0 Y1 V0 ... (low half)
    "punpckhbw %%xmm2,%%xmm1                   \n"  // ... (high half)
    "movdqu    %%xmm0," MEMACCESS(3) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "   \n"
    "lea       " MEMLEA(0x20,3) ",%3           \n"
    "sub       $0x10,%4                        \n"
    "jg         1b                             \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)      // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
// Interleave planar I422 Y, U and V rows into packed UYVY (U0 Y0 V0 Y1 ...).
// Same structure as I422ToYUY2Row_SSE2 but with the chroma bytes leading:
// here the UV vector is the punpck destination so bytes come out U Y V Y.
// Processes 16 Y pixels per iteration.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    "sub       %1,%2                           \n"  // %2 = src_v - src_u offset.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(1) ",%%xmm2         \n"  // 8 U bytes.
    MEMOPREG(movq,0x00,1,2,1,xmm3)                  //  movq    (%1,%2,1),%%xmm3  (8 V bytes)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "punpcklbw %%xmm3,%%xmm2                   \n"  // xmm2 = U0 V0 U1 V1 ...
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 Y bytes.
    "movdqa    %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // U0 Y0 V0 Y1 ... (low half)
    "punpckhbw %%xmm0,%%xmm2                   \n"  // ... (high half)
    "movdqu    %%xmm1," MEMACCESS(3) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "   \n"
    "lea       " MEMLEA(0x20,3) ",%3           \n"
    "sub       $0x10,%4                        \n"
    "jg         1b                             \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)      // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Apply a cubic polynomial to every channel of every pixel:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// 'poly' holds 4 sets of 4 floats: C0 at +0x00, C1 at +0x10, C2 at +0x20,
// C3 at +0x30, one coefficient per channel.  Channels are widened to float,
// evaluated, truncated (cvttps2dq) and saturated back to bytes (packuswb).
// Processes 2 pixels per iteration.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"  // zero, for unpacking.

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // 2 ARGB pixels (8 bytes).
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"  // bytes -> words.
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"  // words -> dwords, pixel 0.
    "punpckhwd %%xmm3,%%xmm4                   \n"  // words -> dwords, pixel 1.
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // x as floats.
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // keep x for higher powers.
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"  // C1 * x
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"  // + C0
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"  // x^2
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"  // x^3
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"  // C2 * x^2
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"  // C3 * x^3
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"  // truncate to int.
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"  // saturate to bytes.
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 version of ARGBPolynomialRow: same cubic per-channel polynomial,
// same 'poly' layout (C0..C3 at 0x00/0x10/0x20/0x30), 2 pixels per loop,
// using fused multiply-adds.  Requires FMA3 support (vfmadd*).
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4       \n"  // C0 for both pixels.
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"  // avoid AVX-SSE transition penalty.
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table, in place.
// 'table_argb' is indexed as table[index * 4 + channel]: each of the four
// channels (B,G,R,A at offsets 0..3) is replaced by its table entry.
// Scalar x86 path; 1 pixel per iteration.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;  // scratch; must be in d so %b1 exists.
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance early; write back at -0x4..-0x1.
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb     0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table, in place.
// Identical to ARGBColorTableRow_X86 but only the first three channels
// (offsets 0..2) are looked up; the alpha byte at offset 3 is untouched.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;  // scratch; must be in d so %b1 exists.
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance early; write back at -0x4..-0x2.
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
// Map each pixel's B, G and R through a luma-selected lookup table row.
// Luma is computed with pmaddubsw/phaddw using the 4 byte weights in
// 'lumacoeff' (splatted across xmm3); the 16-bit result is masked to its
// high byte (pand with 0xff00), giving an offset in multiples of 256, so
// each luma level selects a 256-byte row of 'luma'.  Alpha is copied
// through unchanged.  Processes 4 pixels per iteration; width is consumed
// 4 at a time (sub $0x4), so width is presumably a multiple of 4 —
// TODO(review): confirm against callers.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp;  // scratch byte register (%b0), constrained to d.
  uintptr_t table_temp;  // pointer to the current 256-byte luma table row (a).
  asm volatile (
    "movd      %6,%%xmm3                       \n"  // splat lumacoeff into
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // all 4 dwords of xmm3.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // all ones ...
    "psllw     $0x8,%%xmm4                     \n"  // ... -> 0xff00 word mask.
    "pxor      %%xmm5,%%xmm5                   \n"  // zero.

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // 4 ARGB pixels.
    "pmaddubsw %%xmm3,%%xmm0                   \n"  // weighted byte-pair sums.
    "phaddw    %%xmm0,%%xmm0                   \n"  // per-pixel 16-bit luma.
    "pand      %%xmm4,%%xmm0                   \n"  // keep high byte: row offset.
    "punpcklwd %%xmm5,%%xmm0                   \n"  // widen offsets to 32 bits.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // %1 = luma + offset (row ptr).
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate to next pixel's offset.

    // Pixel 0: look up B, G, R through the row; copy A directly.
    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"  // alpha: no table lookup.
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 1.
    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"  // alpha: no table lookup.
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 2.
    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"  // alpha: no table lookup.
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    // Pixel 3.
    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"  // alpha: no table lookup.
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif