/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the final >> 2 of the 3/4 box filters.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};
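
// Illustrative note (not from the original source): the kShuf*/kMadd* pairs
// above implement the 3/4-width filter as a pshufb gather followed by a
// pmaddubsw weighted sum. A scalar sketch of the weighting they encode,
// assuming four input bytes a,b,c,d reduce to three outputs, with kRound34
// rounding before the shift:
//   dst[0] = (3 * a + 1 * b + 2) >> 2;
//   dst[1] = (2 * b + 2 * c + 2) >> 2;
//   dst[2] = (1 * c + 3 * d + 2) >> 2;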

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "psrlw      $0x1,%%xmm0                    \n"
    "psrlw      $0x1,%%xmm1                    \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
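
// Reference sketch (not from the original source): scalar equivalents of the
// three halvers above, where s is the current row and t is the row
// src_stride bytes below:
//   point:  dst[x] = s[2 * x + 1];
//   linear: dst[x] = (s[2 * x] + s[2 * x + 1] + 1) >> 1;
//   box:    dst[x] = (s[2 * x] + s[2 * x + 1] +
//                     t[2 * x] + t[2 * x + 1] + 2) >> 2;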

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
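
// Reference sketch (not from the original source): ScaleRowDown4 keeps one of
// every four source bytes; the 0x00ff0000 mask in xmm5 selects byte 2 of each
// dword, so in scalar form dst[x] = src[4 * x + 2].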

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "packuswb  %%xmm4,%%xmm4                   \n"
    "psllw     $0x3,%%xmm5                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x4,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
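
// Reference sketch (not from the original source): the 4x4 box filter above
// is equivalent to averaging sixteen source bytes with round-to-nearest:
//   int sum = 0;
//   for (int r = 0; r < 4; ++r)
//     for (int c = 0; c < 4; ++c)
//       sum += src_ptr[r * src_stride + 4 * x + c];
//   dst_ptr[x] = (sum + 8) >> 4;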

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrld    $0x18,%%ymm5,%%ymm5             \n"
    "vpslld    $0x10,%%ymm5,%%ymm5             \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu   %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),     // %3
    "r"((intptr_t)(src_stride * 3))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
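
// Reference sketch (not from the original source): the point-sampling 3/4
// scaler above drops every fourth byte:
//   dst[3 * x + 0] = src[4 * x + 0];
//   dst[3 * x + 1] = src[4 * x + 1];
//   dst[3 * x + 2] = src[4 * x + 3];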

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile(
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile(
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
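
// Illustrative note (not from the original source): the two 3/4 box variants
// above differ only in vertical weighting. _1_Box blends the two rows equally
// with one pavgb, avg = (s + t + 1) >> 1, while _0_Box applies pavgb twice,
//   t2 = (s + t + 1) >> 1;  avg = (s + t2 + 1) >> 1;
// approximating the 3:1 row weighting (3 * s + t + 2) >> 2.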
"m"(kShufAb2), // %2 696 "m"(kScaleAb2) // %3 697 ); 698 asm volatile ( 699 LABELALIGN 700 "1: \n" 701 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 702 MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 703 "lea " MEMLEA(0x10,0) ",%0 \n" 704 "pavgb %%xmm1,%%xmm0 \n" 705 "movdqa %%xmm0,%%xmm1 \n" 706 "pshufb %%xmm2,%%xmm1 \n" 707 "movdqa %%xmm0,%%xmm6 \n" 708 "pshufb %%xmm3,%%xmm6 \n" 709 "paddusw %%xmm6,%%xmm1 \n" 710 "pshufb %%xmm4,%%xmm0 \n" 711 "paddusw %%xmm0,%%xmm1 \n" 712 "pmulhuw %%xmm5,%%xmm1 \n" 713 "packuswb %%xmm1,%%xmm1 \n" 714 "movd %%xmm1," MEMACCESS(1) " \n" 715 "psrlq $0x10,%%xmm1 \n" 716 "movd %%xmm1," MEMACCESS2(0x2,1) " \n" 717 "lea " MEMLEA(0x6,1) ",%1 \n" 718 "sub $0x6,%2 \n" 719 "jg 1b \n" 720 : "+r"(src_ptr), // %0 721 "+r"(dst_ptr), // %1 722 "+r"(dst_width) // %2 723 : "r"((intptr_t)(src_stride)) // %3 724 : "memory", "cc", NACL_R14 725 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 726 ); 727 } 728 729 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, 730 ptrdiff_t src_stride, 731 uint8* dst_ptr, 732 int dst_width) { 733 asm volatile( 734 "movdqa %0,%%xmm2 \n" 735 "movdqa %1,%%xmm3 \n" 736 "movdqa %2,%%xmm4 \n" 737 "pxor %%xmm5,%%xmm5 \n" 738 : 739 : "m"(kShufAc), // %0 740 "m"(kShufAc3), // %1 741 "m"(kScaleAc33) // %2 742 ); 743 asm volatile ( 744 LABELALIGN 745 "1: \n" 746 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 747 MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 748 "movhlps %%xmm0,%%xmm1 \n" 749 "movhlps %%xmm6,%%xmm7 \n" 750 "punpcklbw %%xmm5,%%xmm0 \n" 751 "punpcklbw %%xmm5,%%xmm1 \n" 752 "punpcklbw %%xmm5,%%xmm6 \n" 753 "punpcklbw %%xmm5,%%xmm7 \n" 754 "paddusw %%xmm6,%%xmm0 \n" 755 "paddusw %%xmm7,%%xmm1 \n" 756 MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 757 "lea " MEMLEA(0x10,0) ",%0 \n" 758 "movhlps %%xmm6,%%xmm7 \n" 759 "punpcklbw %%xmm5,%%xmm6 \n" 760 "punpcklbw %%xmm5,%%xmm7 \n" 761 "paddusw %%xmm6,%%xmm0 \n" 762 "paddusw %%xmm7,%%xmm1 \n" 763 "movdqa %%xmm0,%%xmm6 \n" 764 "psrldq $0x2,%%xmm0 \n" 765 "paddusw %%xmm0,%%xmm6 \n" 766 "psrldq $0x2,%%xmm0 \n" 767 "paddusw %%xmm0,%%xmm6 \n" 768 "pshufb %%xmm2,%%xmm6 \n" 769 "movdqa %%xmm1,%%xmm7 \n" 770 "psrldq $0x2,%%xmm1 \n" 771 "paddusw %%xmm1,%%xmm7 \n" 772 "psrldq $0x2,%%xmm1 \n" 773 "paddusw %%xmm1,%%xmm7 \n" 774 "pshufb %%xmm3,%%xmm7 \n" 775 "paddusw %%xmm7,%%xmm6 \n" 776 "pmulhuw %%xmm4,%%xmm6 \n" 777 "packuswb %%xmm6,%%xmm6 \n" 778 "movd %%xmm6," MEMACCESS(1) " \n" 779 "psrlq $0x10,%%xmm6 \n" 780 "movd %%xmm6," MEMACCESS2(0x2,1) " \n" 781 "lea " MEMLEA(0x6,1) ",%1 \n" 782 "sub $0x6,%2 \n" 783 "jg 1b \n" 784 : "+r"(src_ptr), // %0 785 "+r"(dst_ptr), // %1 786 "+r"(dst_width) // %2 787 : "r"((intptr_t)(src_stride)) // %3 788 : "memory", "cc", NACL_R14 789 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 790 ); 791 } 792 793 // Reads 16xN bytes and produces 16 shorts at a time. 

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor     %%ymm5,%%ymm5,%%ymm5            \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm3         \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
    "vpermq    $0xd8,%%ymm3,%%ymm3             \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw  " MEMACCESS(1) ",%%ymm2,%%ymm0  \n"
    "vpaddusw  " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
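
// Illustrative note (not from the original source): pmaddubsw treats its
// second operand as signed, so ScaleFilterCols_SSSE3 below biases each pixel
// by 0x80 (kFsub80) to keep the products in range. Since the two filter
// weights sum to 128,
//   (a - 128) * w0 + (b - 128) * w1 == a * w0 + b * w1 - 0x4000
// and adding kFadd40 (0x4040 == 0x4000 + 0x40) both restores the bias and
// rounds before the final >> 7.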
906 "psrlw $0x7,%%xmm1 \n" 907 "packuswb %%xmm1,%%xmm1 \n" 908 "movd %%xmm1,%k2 \n" 909 "mov %w2," MEMACCESS(0) " \n" 910 "lea " MEMLEA(0x2,0) ",%0 \n" 911 "subl $0x2,%5 \n" 912 "jge 2b \n" 913 914 LABELALIGN 915 "29: \n" 916 "addl $0x1,%5 \n" 917 "jl 99f \n" 918 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 919 "movd %k2,%%xmm0 \n" 920 "psrlw $0x9,%%xmm2 \n" 921 "pshufb %%xmm5,%%xmm2 \n" 922 "psubb %8,%%xmm0 \n" // make pixels signed. 923 "pxor %%xmm6,%%xmm2 \n" 924 "paddusb %%xmm7,%%xmm2 \n" 925 "pmaddubsw %%xmm0,%%xmm2 \n" 926 "paddw %9,%%xmm2 \n" // make pixels unsigned. 927 "psrlw $0x7,%%xmm2 \n" 928 "packuswb %%xmm2,%%xmm2 \n" 929 "movd %%xmm2,%k2 \n" 930 "mov %b2," MEMACCESS(0) " \n" 931 "99: \n" 932 : "+r"(dst_ptr), // %0 933 "+r"(src_ptr), // %1 934 "=&a"(temp_pixel), // %2 935 "=&r"(x0), // %3 936 "=&r"(x1), // %4 937 #if defined(__x86_64__) 938 "+rm"(dst_width) // %5 939 #else 940 "+m"(dst_width) // %5 941 #endif 942 : "rm"(x), // %6 943 "rm"(dx), // %7 944 #if defined(__x86_64__) 945 "x"(kFsub80), // %8 946 "x"(kFadd40) // %9 947 #else 948 "m"(kFsub80), // %8 949 "m"(kFadd40) // %9 950 #endif 951 : "memory", "cc", NACL_R14 952 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 953 ); 954 } 955 956 // Reads 4 pixels, duplicates them and writes 8 pixels. 957 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 958 void ScaleColsUp2_SSE2(uint8* dst_ptr, 959 const uint8* src_ptr, 960 int dst_width, 961 int x, 962 int dx) { 963 (void)x; 964 (void)dx; 965 asm volatile ( 966 LABELALIGN 967 "1: \n" 968 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 969 "lea " MEMLEA(0x10,1) ",%1 \n" 970 "movdqa %%xmm0,%%xmm1 \n" 971 "punpcklbw %%xmm0,%%xmm0 \n" 972 "punpckhbw %%xmm1,%%xmm1 \n" 973 "movdqu %%xmm0," MEMACCESS(0) " \n" 974 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" 975 "lea " MEMLEA(0x20,0) ",%0 \n" 976 "sub $0x20,%2 \n" 977 "jg 1b \n" 978 979 : "+r"(dst_ptr), // %0 980 "+r"(src_ptr), // %1 981 "+r"(dst_width) // %2 982 :: "memory", "cc", "xmm0", "xmm1" 983 ); 984 } 985 986 void ScaleARGBRowDown2_SSE2(const uint8* src_argb, 987 ptrdiff_t src_stride, 988 uint8* dst_argb, 989 int dst_width) { 990 (void)src_stride; 991 asm volatile ( 992 LABELALIGN 993 "1: \n" 994 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 995 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 996 "lea " MEMLEA(0x20,0) ",%0 \n" 997 "shufps $0xdd,%%xmm1,%%xmm0 \n" 998 "movdqu %%xmm0," MEMACCESS(1) " \n" 999 "lea " MEMLEA(0x10,1) ",%1 \n" 1000 "sub $0x4,%2 \n" 1001 "jg 1b \n" 1002 : "+r"(src_argb), // %0 1003 "+r"(dst_argb), // %1 1004 "+r"(dst_width) // %2 1005 :: "memory", "cc", "xmm0", "xmm1" 1006 ); 1007 } 1008 1009 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, 1010 ptrdiff_t src_stride, 1011 uint8* dst_argb, 1012 int dst_width) { 1013 (void)src_stride; 1014 asm volatile ( 1015 LABELALIGN 1016 "1: \n" 1017 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1018 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1019 "lea " MEMLEA(0x20,0) ",%0 \n" 1020 "movdqa %%xmm0,%%xmm2 \n" 1021 "shufps $0x88,%%xmm1,%%xmm0 \n" 1022 "shufps $0xdd,%%xmm1,%%xmm2 \n" 1023 "pavgb %%xmm2,%%xmm0 \n" 1024 "movdqu %%xmm0," MEMACCESS(1) " \n" 1025 "lea " MEMLEA(0x10,1) ",%1 \n" 1026 "sub $0x4,%2 \n" 1027 "jg 1b \n" 1028 : "+r"(src_argb), // %0 1029 "+r"(dst_argb), // %1 1030 "+r"(dst_width) // %2 1031 :: "memory", "cc", "xmm0", "xmm1" 1032 ); 1033 } 1034 1035 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, 1036 ptrdiff_t src_stride, 1037 uint8* dst_argb, 1038 int dst_width) { 1039 asm volatile ( 1040 LABELALIGN 1041 "1: \n" 1042 
"movdqu " MEMACCESS(0) ",%%xmm0 \n" 1043 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1044 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 1045 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 1046 "lea " MEMLEA(0x20,0) ",%0 \n" 1047 "pavgb %%xmm2,%%xmm0 \n" 1048 "pavgb %%xmm3,%%xmm1 \n" 1049 "movdqa %%xmm0,%%xmm2 \n" 1050 "shufps $0x88,%%xmm1,%%xmm0 \n" 1051 "shufps $0xdd,%%xmm1,%%xmm2 \n" 1052 "pavgb %%xmm2,%%xmm0 \n" 1053 "movdqu %%xmm0," MEMACCESS(1) " \n" 1054 "lea " MEMLEA(0x10,1) ",%1 \n" 1055 "sub $0x4,%2 \n" 1056 "jg 1b \n" 1057 : "+r"(src_argb), // %0 1058 "+r"(dst_argb), // %1 1059 "+r"(dst_width) // %2 1060 : "r"((intptr_t)(src_stride)) // %3 1061 : "memory", "cc", NACL_R14 1062 "xmm0", "xmm1", "xmm2", "xmm3" 1063 ); 1064 } 1065 1066 // Reads 4 pixels at a time. 1067 // Alignment requirement: dst_argb 16 byte aligned. 1068 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, 1069 ptrdiff_t src_stride, 1070 int src_stepx, 1071 uint8* dst_argb, 1072 int dst_width) { 1073 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); 1074 intptr_t src_stepx_x12; 1075 (void)src_stride; 1076 asm volatile ( 1077 "lea " MEMLEA3(0x00,1,4) ",%1 \n" 1078 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" 1079 LABELALIGN 1080 "1: \n" 1081 "movd " MEMACCESS(0) ",%%xmm0 \n" 1082 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 1083 "punpckldq %%xmm1,%%xmm0 \n" 1084 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 1085 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 1086 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" 1087 "punpckldq %%xmm3,%%xmm2 \n" 1088 "punpcklqdq %%xmm2,%%xmm0 \n" 1089 "movdqu %%xmm0," MEMACCESS(2) " \n" 1090 "lea " MEMLEA(0x10,2) ",%2 \n" 1091 "sub $0x4,%3 \n" 1092 "jg 1b \n" 1093 : "+r"(src_argb), // %0 1094 "+r"(src_stepx_x4), // %1 1095 "+r"(dst_argb), // %2 1096 "+r"(dst_width), // %3 1097 "=&r"(src_stepx_x12) // %4 1098 :: "memory", "cc", NACL_R14 1099 "xmm0", "xmm1", "xmm2", "xmm3" 1100 ); 1101 } 1102 1103 // Blends four 2x2 to 4x1. 1104 // Alignment requirement: dst_argb 16 byte aligned. 

// Blends four 2x2 to 4x1.
// Uses unaligned stores, so no alignment is required.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps  (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq    (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps  (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps  (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq    (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps  (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
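
// Illustrative note (not from the original source): the column scalers below
// track the source position in 16.16 fixed point. The integer pixel index is
// x >> 16 (extracted with pextrw $0x1/$0x3 from each 32-bit lane) and paddd
// advances x by dx. A scalar nearest-neighbor equivalent:
//   for (int i = 0; i < dst_width; ++i, x += dx)
//     dst[i] = src[x >> 16];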
"xmm1", "xmm2", "xmm3", "xmm4" 1218 ); 1219 } 1220 1221 // Reads 4 pixels, duplicates them and writes 8 pixels. 1222 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 1223 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, 1224 const uint8* src_argb, 1225 int dst_width, 1226 int x, 1227 int dx) { 1228 (void)x; 1229 (void)dx; 1230 asm volatile ( 1231 LABELALIGN 1232 "1: \n" 1233 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1234 "lea " MEMLEA(0x10,1) ",%1 \n" 1235 "movdqa %%xmm0,%%xmm1 \n" 1236 "punpckldq %%xmm0,%%xmm0 \n" 1237 "punpckhdq %%xmm1,%%xmm1 \n" 1238 "movdqu %%xmm0," MEMACCESS(0) " \n" 1239 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" 1240 "lea " MEMLEA(0x20,0) ",%0 \n" 1241 "sub $0x8,%2 \n" 1242 "jg 1b \n" 1243 1244 : "+r"(dst_argb), // %0 1245 "+r"(src_argb), // %1 1246 "+r"(dst_width) // %2 1247 :: "memory", "cc", NACL_R14 1248 "xmm0", "xmm1" 1249 ); 1250 } 1251 1252 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw 1253 static uvec8 kShuffleColARGB = { 1254 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 1255 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel 1256 }; 1257 1258 // Shuffle table for duplicating 2 fractions into 8 bytes each 1259 static uvec8 kShuffleFractions = { 1260 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 1261 }; 1262 1263 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version 1264 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, 1265 const uint8* src_argb, 1266 int dst_width, 1267 int x, 1268 int dx) { 1269 intptr_t x0, x1; 1270 asm volatile( 1271 "movdqa %0,%%xmm4 \n" 1272 "movdqa %1,%%xmm5 \n" 1273 : 1274 : "m"(kShuffleColARGB), // %0 1275 "m"(kShuffleFractions) // %1 1276 ); 1277 1278 asm volatile ( 1279 "movd %5,%%xmm2 \n" 1280 "movd %6,%%xmm3 \n" 1281 "pcmpeqb %%xmm6,%%xmm6 \n" 1282 "psrlw $0x9,%%xmm6 \n" 1283 "pextrw $0x1,%%xmm2,%k3 \n" 1284 "sub $0x2,%2 \n" 1285 "jl 29f \n" 1286 "movdqa %%xmm2,%%xmm0 \n" 1287 "paddd %%xmm3,%%xmm0 \n" 1288 "punpckldq %%xmm0,%%xmm2 \n" 1289 "punpckldq %%xmm3,%%xmm3 \n" 1290 "paddd %%xmm3,%%xmm3 \n" 1291 "pextrw $0x3,%%xmm2,%k4 \n" 1292 1293 LABELALIGN 1294 "2: \n" 1295 "movdqa %%xmm2,%%xmm1 \n" 1296 "paddd %%xmm3,%%xmm2 \n" 1297 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 1298 "psrlw $0x9,%%xmm1 \n" 1299 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 1300 "pshufb %%xmm5,%%xmm1 \n" 1301 "pshufb %%xmm4,%%xmm0 \n" 1302 "pxor %%xmm6,%%xmm1 \n" 1303 "pmaddubsw %%xmm1,%%xmm0 \n" 1304 "psrlw $0x7,%%xmm0 \n" 1305 "pextrw $0x1,%%xmm2,%k3 \n" 1306 "pextrw $0x3,%%xmm2,%k4 \n" 1307 "packuswb %%xmm0,%%xmm0 \n" 1308 "movq %%xmm0," MEMACCESS(0) " \n" 1309 "lea " MEMLEA(0x8,0) ",%0 \n" 1310 "sub $0x2,%2 \n" 1311 "jge 2b \n" 1312 1313 LABELALIGN 1314 "29: \n" 1315 "add $0x1,%2 \n" 1316 "jl 99f \n" 1317 "psrlw $0x9,%%xmm2 \n" 1318 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 1319 "pshufb %%xmm5,%%xmm2 \n" 1320 "pshufb %%xmm4,%%xmm0 \n" 1321 "pxor %%xmm6,%%xmm2 \n" 1322 "pmaddubsw %%xmm2,%%xmm0 \n" 1323 "psrlw $0x7,%%xmm0 \n" 1324 "packuswb %%xmm0,%%xmm0 \n" 1325 "movd %%xmm0," MEMACCESS(0) " \n" 1326 1327 LABELALIGN 1328 "99: \n" 1329 : "+r"(dst_argb), // %0 1330 "+r"(src_argb), // %1 1331 "+rm"(dst_width), // %2 1332 "=&r"(x0), // %3 1333 "=&r"(x1) // %4 1334 : "rm"(x), // %5 1335 "rm"(dx) // %6 1336 : "memory", "cc", NACL_R14 1337 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1338 ); 1339 } 1340 1341 // Divide num by div and return as 16.16 fixed point result. 

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "sub       $0x10001,%%eax                  \n"
    "sbb       $0x0,%%edx                      \n"
    "sub       $0x1,%1                         \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif