/* A program to test SSE4.1/SSE4.2 instructions.
   Revisions:  Nov.208     - wrote this file
               Apr.10.2010 - added PEXTR* tests
               Apr.16.2010 - added PINS*  tests
*/

/* HOW TO COMPILE:
   gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
//#include "tests/malloc.h" // reenable when reintegrated
#include <string.h>



// rmme when reintegrated
// Allocates a 16-aligned block.  Asserts if the allocation fails.
#ifdef VGO_darwin
#include <stdlib.h>
#else
#include <malloc.h>
#endif
/* Returns a block of szB bytes whose address is a multiple of 16.
   The caller owns the block and releases it with free().  Both the
   allocation result and its alignment are checked with assert(). */
__attribute__((unused))
static void* memalign16(size_t szB)
{
   void* x;
#if defined(VGO_darwin)
   // Darwin lacks memalign, but its malloc is always 16-aligned anyway.
   x = malloc(szB);
#else
   x = memalign(16, szB);
#endif
   assert(x);
   assert(0 == ((16-1) & (unsigned long)x));
   return x;
}



/* A 128-bit vector held as 16 raw bytes; byte 0 is the least
   significant byte (see the little-endian note further down). */
typedef  unsigned char           V128[16];
typedef  unsigned int            UInt;
typedef  signed int              Int;
typedef  unsigned char           UChar;
typedef  unsigned long long int  ULong;

typedef  unsigned char           Bool;
#define False ((Bool)0)
#define True  ((Bool)1)


/* Two vector operands plus the vector result of one instruction. */
typedef
   struct {
      V128 arg1;
      V128 arg2;
      V128 res;
   }
   RRArgs;

/* One vector operand plus the vector result of one instruction. */
typedef
   struct {
      V128 arg1;
      V128 res;
   }
   RMArgs;

/* Builds a V128 from two 64-bit halves: wLo is copied to bytes 0..7
   and wHi to bytes 8..15 of *res. */
static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
{
   // try to sidestep strict-aliasing snafus by memcpying explicitly
   UChar* p = (UChar*)res;
   memcpy(&p[8], (UChar*)&wHi, 8);
   memcpy(&p[0], (UChar*)&wLo, 8);
}

/* Returns the next byte of a fixed pseudo-random sequence.  The state
   is a linear congruential generator that always starts from the same
   seed, so the sequence is identical on every run. */
static UChar randUChar ( void )
{
   static UInt seed = 80021;
   seed = 1103515245 * seed + 12345;
   /* use bits 17..24 of the state rather than its low bits */
   return (seed >> 17) & 0xFF;
}

/* Returns 64 pseudo-random bits assembled from eight randUChar()
   calls; the first byte drawn ends up most significant. */
static ULong randULong ( void )
{
   Int i;
   ULong r = 0;
   for (i = 0; i < 8; i++) {
      r = (r << 8) | (ULong)(0xFF & randUChar());
   }
   return r;
}

/* Fills all 16 bytes of *v from randUChar(), byte 0 first. */
static void randV128 ( V128* v )
{
   Int i;
   for (i = 0; i < 16; i++)
      (*v)[i] = randUChar();
}

/* Prints *v as 32 hex digits, most significant byte (index 15) first.
   No separator or newline is emitted. */
static void showV128 ( V128* v )
{
   Int i;
   for (i = 15; i >= 0; i--)
      printf("%02x", (UInt)(*v)[i]);   /* %x takes an unsigned argument */
}

/* Like showV128, but each byte of *v is ANDed with the corresponding
   byte of *mask before being printed. */
static void showMaskedV128 ( V128* v, V128* mask )
{
   Int i;
   for (i = 15; i >= 0; i--)
      printf("%02x", (UInt)( ((*v)[i]) & ((*mask)[i]) ));
}

/* Prints one result line for an "immediate, 64-bit scalar -> vector"
   test: label rOrM, opcode name, immediate, the scalar source, the
   vector before the operation and the vector after it. */
static void showIGVV( const char* rOrM, const char* op, Int imm,
                      ULong src64, V128* dst, V128* res )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   printf("%016llx", src64);
   printf(" ");
   showV128(dst);
   printf(" ");
   showV128(res);
   printf("\n");
}

/* Prints one result line for an "immediate, vector -> 64-bit scalar"
   test: label rOrM, opcode name, immediate, the vector source, and the
   two 64-bit values argR and res. */
static void showIAG ( const char* rOrM, const char* op, Int imm,
                      V128* argL, ULong argR, ULong res )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(argL);
   printf(" ");
   printf("%016llx", argR);
   printf(" ");
   printf("%016llx", res);
   printf("\n");
}

/* Prints one result line for an "immediate, vector, vector" test:
   label rOrM, opcode name, immediate, both arguments of *rra and its
   result filtered through *rmask. */
static void showIAA ( const char* rOrM, const char* op, Int imm,
                      RRArgs* rra, V128* rmask )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(&rra->arg1);
   printf(" ");
   showV128(&rra->arg2);
   printf(" ");
   showMaskedV128(&rra->res, rmask);
   printf("\n");
}

/* Same as showIAA, for instructions that take no immediate. */
static void showAA ( const char* rOrM, const char* op,
                     RRArgs* rra, V128* rmask )
{
   printf("%s %10s ", rOrM, op);
   showV128(&rra->arg1);
   printf(" ");
   showV128(&rra->arg2);
   printf(" ");
   showMaskedV128(&rra->res, rmask);
   printf("\n");
}

/* Note: these are little endian.  Hence first byte is the least
   significant byte of lane zero. */

/* Mask for insns where all result bits are non-approximated. */
static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };

*/ 170 __attribute__((unused)) 171 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF, 172 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF }; 173 174 /* Mark for insns which produce approximated scalar short results. */ 175 __attribute__((unused)) 176 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF, 177 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF }; 178 179 static V128 fives = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55, 180 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 }; 181 182 static V128 zeroes = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 183 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; 184 185 double mkPosInf ( void ) { return 1.0 / 0.0; } 186 double mkNegInf ( void ) { return -mkPosInf(); } 187 double mkPosNan ( void ) { return 0.0 / 0.0; } 188 double mkNegNan ( void ) { return -mkPosNan(); } 189 190 __attribute__((noinline)) 191 UInt get_mxcsr ( void ) 192 { 193 ULong w64; 194 __asm__ __volatile__( 195 "subq $8, %%rsp" "\n\t" 196 "stmxcsr (%%rsp)" "\n\t" 197 "movq (%%rsp), %0" "\n" 198 "addq $8, %%rsp" 199 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc" 200 ); 201 if (0) printf("get %08x\n", (UInt)w64); 202 return (UInt)w64; 203 } 204 205 __attribute__((noinline)) 206 void set_mxcsr ( UInt w32 ) 207 { 208 if (0) printf("set %08x\n", w32); 209 ULong w64 = (ULong)w32; 210 __asm__ __volatile__( 211 "subq $8, %%rsp" "\n\t" 212 "movq %0, (%%rsp)" "\n\t" 213 "ldmxcsr (%%rsp)" "\n\t" 214 "addq $8, %%rsp" 215 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc" 216 ); 217 } 218 219 UInt get_sse_roundingmode ( void ) 220 { 221 UInt w = get_mxcsr(); 222 return (w >> 13) & 3; 223 } 224 225 void set_sse_roundingmode ( UInt m ) 226 { 227 UInt w; 228 assert(0 == (m & ~3)); 229 w = get_mxcsr(); 230 w &= ~(3 << 13); 231 w |= (m << 13); 232 set_mxcsr(w); 233 } 234 235 236 #define DO_imm_r_r(_opname, _imm, _src, _dst) \ 237 { \ 238 V128 _tmp; \ 239 __asm__ __volatile__( \ 240 "movupd (%0), %%xmm2" "\n\t" \ 241 "movupd (%1), %%xmm11" "\n\t" \ 242 _opname " $" 
#_imm ", %%xmm2, %%xmm11" "\n\t" \ 243 "movupd %%xmm11, (%2)" "\n" \ 244 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 245 : "cc", "memory", "xmm2", "xmm11" \ 246 ); \ 247 RRArgs rra; \ 248 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 249 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 250 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 251 showIAA("r", (_opname), (_imm), &rra, &AllMask); \ 252 } 253 254 #define DO_imm_m_r(_opname, _imm, _src, _dst) \ 255 { \ 256 V128 _tmp; \ 257 V128* _srcM = memalign16(sizeof(V128)); \ 258 memcpy(_srcM, &(_src), sizeof(V128)); \ 259 __asm__ __volatile__( \ 260 "movupd (%1), %%xmm11" "\n\t" \ 261 _opname " $" #_imm ", (%0), %%xmm11" "\n\t" \ 262 "movupd %%xmm11, (%2)" "\n" \ 263 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 264 : "cc", "memory", "xmm11" \ 265 ); \ 266 RRArgs rra; \ 267 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 268 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 269 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 270 showIAA("m", (_opname), (_imm), &rra, &AllMask); \ 271 free(_srcM); \ 272 } 273 274 #define DO_imm_mandr_r(_opname, _imm, _src, _dst) \ 275 DO_imm_r_r( _opname, _imm, _src, _dst ) \ 276 DO_imm_m_r( _opname, _imm, _src, _dst ) 277 278 279 280 281 282 #define DO_r_r(_opname, _src, _dst) \ 283 { \ 284 V128 _tmp; \ 285 __asm__ __volatile__( \ 286 "movupd (%0), %%xmm2" "\n\t" \ 287 "movupd (%1), %%xmm11" "\n\t" \ 288 _opname " %%xmm2, %%xmm11" "\n\t" \ 289 "movupd %%xmm11, (%2)" "\n" \ 290 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 291 : "cc", "memory", "xmm2", "xmm11" \ 292 ); \ 293 RRArgs rra; \ 294 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 295 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 296 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 297 showAA("r", (_opname), &rra, &AllMask); \ 298 } 299 300 #define DO_m_r(_opname, _src, _dst) \ 301 { \ 302 V128 _tmp; \ 303 V128* _srcM = memalign16(sizeof(V128)); \ 304 memcpy(_srcM, &(_src), sizeof(V128)); \ 305 __asm__ __volatile__( \ 
306 "movupd (%1), %%xmm11" "\n\t" \ 307 _opname " (%0), %%xmm11" "\n\t" \ 308 "movupd %%xmm11, (%2)" "\n" \ 309 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 310 : "cc", "memory", "xmm11" \ 311 ); \ 312 RRArgs rra; \ 313 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 314 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 315 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 316 showAA("m", (_opname), &rra, &AllMask); \ 317 free(_srcM); \ 318 } 319 320 #define DO_mandr_r(_opname, _src, _dst) \ 321 DO_r_r(_opname, _src, _dst) \ 322 DO_m_r(_opname, _src, _dst) 323 324 325 326 327 #define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix) \ 328 { \ 329 ULong _scbefore = 0x5555555555555555ULL; \ 330 ULong _scafter = 0xAAAAAAAAAAAAAAAAULL; \ 331 /* This assumes that gcc won't make any of %0, %1, %2 */ \ 332 /* be r11. That should be ensured (cough, cough) */ \ 333 /* by declaring r11 to be clobbered. */ \ 334 __asm__ __volatile__( \ 335 "movupd (%0), %%xmm2" "\n\t" \ 336 "movq (%1), %%r11" "\n\t" \ 337 _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix "\n\t" \ 338 "movq %%r11, (%2)" "\n" \ 339 : /*out*/ \ 340 : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter)) \ 341 : "cc", "memory", "xmm2", "r11" \ 342 ); \ 343 showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 344 } 345 346 #define DO_imm_r_to_mscalar(_opname, _imm, _src) \ 347 { \ 348 ULong _scbefore = 0x5555555555555555ULL; \ 349 ULong _scafter = _scbefore; \ 350 __asm__ __volatile__( \ 351 "movupd (%0), %%xmm2" "\n\t" \ 352 _opname " $" #_imm ", %%xmm2, (%1)" "\n\t" \ 353 : /*out*/ \ 354 : /*in*/ "r"(&(_src)), "r"(&(_scafter)) \ 355 : "cc", "memory", "xmm2" \ 356 ); \ 357 showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 358 } 359 360 #define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix) \ 361 DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix ) \ 362 DO_imm_r_to_mscalar( _opname, _imm, _src ) 363 364 365 366 367 368 369 370 371 #define 
DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix) \ 372 { \ 373 V128 dstv; \ 374 V128 res; \ 375 ULong src64 = (ULong)(_src); \ 376 memcpy(dstv, fives, sizeof(dstv)); \ 377 memcpy(res, zeroes, sizeof(res)); \ 378 /* This assumes that gcc won't make any of %0, %1, %2 */ \ 379 /* be r11. That should be ensured (cough, cough) */ \ 380 /* by declaring r11 to be clobbered. */ \ 381 __asm__ __volatile__( \ 382 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 383 "movq (%1), %%r11" "\n\t" /*src64*/ \ 384 _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2" "\n\t" \ 385 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 386 : /*out*/ \ 387 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 388 : "cc", "memory", "xmm2", "r11" \ 389 ); \ 390 showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \ 391 } 392 #define DO_imm_mscalar_to_r(_opname, _imm, _src) \ 393 { \ 394 V128 dstv; \ 395 V128 res; \ 396 ULong src64 = (ULong)(_src); \ 397 memcpy(dstv, fives, sizeof(dstv)); \ 398 memcpy(res, zeroes, sizeof(res)); \ 399 __asm__ __volatile__( \ 400 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 401 _opname " $" #_imm ", (%1), %%xmm2" "\n\t" \ 402 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 403 : /*out*/ \ 404 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 405 : "cc", "memory", "xmm2" \ 406 ); \ 407 showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \ 408 } 409 410 #define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix) \ 411 DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix ) \ 412 DO_imm_mscalar_to_r( _opname, _imm, _src ) 413 414 415 416 417 418 void test_BLENDPD ( void ) 419 { 420 V128 src, dst; 421 Int i; 422 for (i = 0; i < 10; i++) { 423 randV128(&src); 424 randV128(&dst); 425 DO_imm_mandr_r("blendpd", 0, src, dst); 426 DO_imm_mandr_r("blendpd", 1, src, dst); 427 DO_imm_mandr_r("blendpd", 2, src, dst); 428 DO_imm_mandr_r("blendpd", 3, src, dst); 429 } 430 } 431 432 void test_BLENDPS ( void ) 433 { 434 V128 src, dst; 435 Int i; 436 for (i = 0; i < 10; i++) { 437 randV128(&src); 438 randV128(&dst); 439 
DO_imm_mandr_r("blendps", 0, src, dst); 440 DO_imm_mandr_r("blendps", 1, src, dst); 441 DO_imm_mandr_r("blendps", 2, src, dst); 442 DO_imm_mandr_r("blendps", 3, src, dst); 443 DO_imm_mandr_r("blendps", 4, src, dst); 444 DO_imm_mandr_r("blendps", 5, src, dst); 445 DO_imm_mandr_r("blendps", 6, src, dst); 446 DO_imm_mandr_r("blendps", 7, src, dst); 447 DO_imm_mandr_r("blendps", 8, src, dst); 448 DO_imm_mandr_r("blendps", 9, src, dst); 449 DO_imm_mandr_r("blendps", 10, src, dst); 450 DO_imm_mandr_r("blendps", 11, src, dst); 451 DO_imm_mandr_r("blendps", 12, src, dst); 452 DO_imm_mandr_r("blendps", 13, src, dst); 453 DO_imm_mandr_r("blendps", 14, src, dst); 454 DO_imm_mandr_r("blendps", 15, src, dst); 455 } 456 } 457 458 void test_DPPD ( void ) 459 { 460 V128 src, dst; 461 { 462 *(double*)(&src[0]) = 1.2345; 463 *(double*)(&src[8]) = -6.78910; 464 *(double*)(&dst[0]) = -11.121314; 465 *(double*)(&dst[8]) = 15.161718; 466 DO_imm_mandr_r("dppd", 0, src, dst); 467 DO_imm_mandr_r("dppd", 1, src, dst); 468 DO_imm_mandr_r("dppd", 2, src, dst); 469 DO_imm_mandr_r("dppd", 3, src, dst); 470 DO_imm_mandr_r("dppd", 4, src, dst); 471 DO_imm_mandr_r("dppd", 5, src, dst); 472 DO_imm_mandr_r("dppd", 6, src, dst); 473 DO_imm_mandr_r("dppd", 7, src, dst); 474 DO_imm_mandr_r("dppd", 8, src, dst); 475 DO_imm_mandr_r("dppd", 9, src, dst); 476 DO_imm_mandr_r("dppd", 10, src, dst); 477 DO_imm_mandr_r("dppd", 11, src, dst); 478 DO_imm_mandr_r("dppd", 12, src, dst); 479 DO_imm_mandr_r("dppd", 13, src, dst); 480 DO_imm_mandr_r("dppd", 14, src, dst); 481 DO_imm_mandr_r("dppd", 15, src, dst); 482 DO_imm_mandr_r("dppd", 16, src, dst); 483 DO_imm_mandr_r("dppd", 17, src, dst); 484 DO_imm_mandr_r("dppd", 18, src, dst); 485 DO_imm_mandr_r("dppd", 19, src, dst); 486 DO_imm_mandr_r("dppd", 20, src, dst); 487 DO_imm_mandr_r("dppd", 21, src, dst); 488 DO_imm_mandr_r("dppd", 22, src, dst); 489 DO_imm_mandr_r("dppd", 23, src, dst); 490 DO_imm_mandr_r("dppd", 24, src, dst); 491 DO_imm_mandr_r("dppd", 25, 
src, dst); 492 DO_imm_mandr_r("dppd", 26, src, dst); 493 DO_imm_mandr_r("dppd", 27, src, dst); 494 DO_imm_mandr_r("dppd", 28, src, dst); 495 DO_imm_mandr_r("dppd", 29, src, dst); 496 DO_imm_mandr_r("dppd", 30, src, dst); 497 DO_imm_mandr_r("dppd", 31, src, dst); 498 DO_imm_mandr_r("dppd", 32, src, dst); 499 DO_imm_mandr_r("dppd", 33, src, dst); 500 DO_imm_mandr_r("dppd", 34, src, dst); 501 DO_imm_mandr_r("dppd", 35, src, dst); 502 DO_imm_mandr_r("dppd", 36, src, dst); 503 DO_imm_mandr_r("dppd", 37, src, dst); 504 DO_imm_mandr_r("dppd", 38, src, dst); 505 DO_imm_mandr_r("dppd", 39, src, dst); 506 DO_imm_mandr_r("dppd", 40, src, dst); 507 DO_imm_mandr_r("dppd", 41, src, dst); 508 DO_imm_mandr_r("dppd", 42, src, dst); 509 DO_imm_mandr_r("dppd", 43, src, dst); 510 DO_imm_mandr_r("dppd", 44, src, dst); 511 DO_imm_mandr_r("dppd", 45, src, dst); 512 DO_imm_mandr_r("dppd", 46, src, dst); 513 DO_imm_mandr_r("dppd", 47, src, dst); 514 DO_imm_mandr_r("dppd", 48, src, dst); 515 DO_imm_mandr_r("dppd", 49, src, dst); 516 DO_imm_mandr_r("dppd", 50, src, dst); 517 DO_imm_mandr_r("dppd", 51, src, dst); 518 DO_imm_mandr_r("dppd", 52, src, dst); 519 DO_imm_mandr_r("dppd", 53, src, dst); 520 DO_imm_mandr_r("dppd", 54, src, dst); 521 DO_imm_mandr_r("dppd", 55, src, dst); 522 DO_imm_mandr_r("dppd", 56, src, dst); 523 DO_imm_mandr_r("dppd", 57, src, dst); 524 DO_imm_mandr_r("dppd", 58, src, dst); 525 DO_imm_mandr_r("dppd", 59, src, dst); 526 DO_imm_mandr_r("dppd", 60, src, dst); 527 DO_imm_mandr_r("dppd", 61, src, dst); 528 DO_imm_mandr_r("dppd", 62, src, dst); 529 DO_imm_mandr_r("dppd", 63, src, dst); 530 DO_imm_mandr_r("dppd", 64, src, dst); 531 DO_imm_mandr_r("dppd", 65, src, dst); 532 DO_imm_mandr_r("dppd", 66, src, dst); 533 DO_imm_mandr_r("dppd", 67, src, dst); 534 DO_imm_mandr_r("dppd", 68, src, dst); 535 DO_imm_mandr_r("dppd", 69, src, dst); 536 DO_imm_mandr_r("dppd", 70, src, dst); 537 DO_imm_mandr_r("dppd", 71, src, dst); 538 DO_imm_mandr_r("dppd", 72, src, dst); 539 
DO_imm_mandr_r("dppd", 73, src, dst); 540 DO_imm_mandr_r("dppd", 74, src, dst); 541 DO_imm_mandr_r("dppd", 75, src, dst); 542 DO_imm_mandr_r("dppd", 76, src, dst); 543 DO_imm_mandr_r("dppd", 77, src, dst); 544 DO_imm_mandr_r("dppd", 78, src, dst); 545 DO_imm_mandr_r("dppd", 79, src, dst); 546 DO_imm_mandr_r("dppd", 80, src, dst); 547 DO_imm_mandr_r("dppd", 81, src, dst); 548 DO_imm_mandr_r("dppd", 82, src, dst); 549 DO_imm_mandr_r("dppd", 83, src, dst); 550 DO_imm_mandr_r("dppd", 84, src, dst); 551 DO_imm_mandr_r("dppd", 85, src, dst); 552 DO_imm_mandr_r("dppd", 86, src, dst); 553 DO_imm_mandr_r("dppd", 87, src, dst); 554 DO_imm_mandr_r("dppd", 88, src, dst); 555 DO_imm_mandr_r("dppd", 89, src, dst); 556 DO_imm_mandr_r("dppd", 90, src, dst); 557 DO_imm_mandr_r("dppd", 91, src, dst); 558 DO_imm_mandr_r("dppd", 92, src, dst); 559 DO_imm_mandr_r("dppd", 93, src, dst); 560 DO_imm_mandr_r("dppd", 94, src, dst); 561 DO_imm_mandr_r("dppd", 95, src, dst); 562 DO_imm_mandr_r("dppd", 96, src, dst); 563 DO_imm_mandr_r("dppd", 97, src, dst); 564 DO_imm_mandr_r("dppd", 98, src, dst); 565 DO_imm_mandr_r("dppd", 99, src, dst); 566 DO_imm_mandr_r("dppd", 100, src, dst); 567 DO_imm_mandr_r("dppd", 101, src, dst); 568 DO_imm_mandr_r("dppd", 102, src, dst); 569 DO_imm_mandr_r("dppd", 103, src, dst); 570 DO_imm_mandr_r("dppd", 104, src, dst); 571 DO_imm_mandr_r("dppd", 105, src, dst); 572 DO_imm_mandr_r("dppd", 106, src, dst); 573 DO_imm_mandr_r("dppd", 107, src, dst); 574 DO_imm_mandr_r("dppd", 108, src, dst); 575 DO_imm_mandr_r("dppd", 109, src, dst); 576 DO_imm_mandr_r("dppd", 110, src, dst); 577 DO_imm_mandr_r("dppd", 111, src, dst); 578 DO_imm_mandr_r("dppd", 112, src, dst); 579 DO_imm_mandr_r("dppd", 113, src, dst); 580 DO_imm_mandr_r("dppd", 114, src, dst); 581 DO_imm_mandr_r("dppd", 115, src, dst); 582 DO_imm_mandr_r("dppd", 116, src, dst); 583 DO_imm_mandr_r("dppd", 117, src, dst); 584 DO_imm_mandr_r("dppd", 118, src, dst); 585 DO_imm_mandr_r("dppd", 119, src, dst); 586 
DO_imm_mandr_r("dppd", 120, src, dst); 587 DO_imm_mandr_r("dppd", 121, src, dst); 588 DO_imm_mandr_r("dppd", 122, src, dst); 589 DO_imm_mandr_r("dppd", 123, src, dst); 590 DO_imm_mandr_r("dppd", 124, src, dst); 591 DO_imm_mandr_r("dppd", 125, src, dst); 592 DO_imm_mandr_r("dppd", 126, src, dst); 593 DO_imm_mandr_r("dppd", 127, src, dst); 594 DO_imm_mandr_r("dppd", 128, src, dst); 595 DO_imm_mandr_r("dppd", 129, src, dst); 596 DO_imm_mandr_r("dppd", 130, src, dst); 597 DO_imm_mandr_r("dppd", 131, src, dst); 598 DO_imm_mandr_r("dppd", 132, src, dst); 599 DO_imm_mandr_r("dppd", 133, src, dst); 600 DO_imm_mandr_r("dppd", 134, src, dst); 601 DO_imm_mandr_r("dppd", 135, src, dst); 602 DO_imm_mandr_r("dppd", 136, src, dst); 603 DO_imm_mandr_r("dppd", 137, src, dst); 604 DO_imm_mandr_r("dppd", 138, src, dst); 605 DO_imm_mandr_r("dppd", 139, src, dst); 606 DO_imm_mandr_r("dppd", 140, src, dst); 607 DO_imm_mandr_r("dppd", 141, src, dst); 608 DO_imm_mandr_r("dppd", 142, src, dst); 609 DO_imm_mandr_r("dppd", 143, src, dst); 610 DO_imm_mandr_r("dppd", 144, src, dst); 611 DO_imm_mandr_r("dppd", 145, src, dst); 612 DO_imm_mandr_r("dppd", 146, src, dst); 613 DO_imm_mandr_r("dppd", 147, src, dst); 614 DO_imm_mandr_r("dppd", 148, src, dst); 615 DO_imm_mandr_r("dppd", 149, src, dst); 616 DO_imm_mandr_r("dppd", 150, src, dst); 617 DO_imm_mandr_r("dppd", 151, src, dst); 618 DO_imm_mandr_r("dppd", 152, src, dst); 619 DO_imm_mandr_r("dppd", 153, src, dst); 620 DO_imm_mandr_r("dppd", 154, src, dst); 621 DO_imm_mandr_r("dppd", 155, src, dst); 622 DO_imm_mandr_r("dppd", 156, src, dst); 623 DO_imm_mandr_r("dppd", 157, src, dst); 624 DO_imm_mandr_r("dppd", 158, src, dst); 625 DO_imm_mandr_r("dppd", 159, src, dst); 626 DO_imm_mandr_r("dppd", 160, src, dst); 627 DO_imm_mandr_r("dppd", 161, src, dst); 628 DO_imm_mandr_r("dppd", 162, src, dst); 629 DO_imm_mandr_r("dppd", 163, src, dst); 630 DO_imm_mandr_r("dppd", 164, src, dst); 631 DO_imm_mandr_r("dppd", 165, src, dst); 632 
DO_imm_mandr_r("dppd", 166, src, dst); 633 DO_imm_mandr_r("dppd", 167, src, dst); 634 DO_imm_mandr_r("dppd", 168, src, dst); 635 DO_imm_mandr_r("dppd", 169, src, dst); 636 DO_imm_mandr_r("dppd", 170, src, dst); 637 DO_imm_mandr_r("dppd", 171, src, dst); 638 DO_imm_mandr_r("dppd", 172, src, dst); 639 DO_imm_mandr_r("dppd", 173, src, dst); 640 DO_imm_mandr_r("dppd", 174, src, dst); 641 DO_imm_mandr_r("dppd", 175, src, dst); 642 DO_imm_mandr_r("dppd", 176, src, dst); 643 DO_imm_mandr_r("dppd", 177, src, dst); 644 DO_imm_mandr_r("dppd", 178, src, dst); 645 DO_imm_mandr_r("dppd", 179, src, dst); 646 DO_imm_mandr_r("dppd", 180, src, dst); 647 DO_imm_mandr_r("dppd", 181, src, dst); 648 DO_imm_mandr_r("dppd", 182, src, dst); 649 DO_imm_mandr_r("dppd", 183, src, dst); 650 DO_imm_mandr_r("dppd", 184, src, dst); 651 DO_imm_mandr_r("dppd", 185, src, dst); 652 DO_imm_mandr_r("dppd", 186, src, dst); 653 DO_imm_mandr_r("dppd", 187, src, dst); 654 DO_imm_mandr_r("dppd", 188, src, dst); 655 DO_imm_mandr_r("dppd", 189, src, dst); 656 DO_imm_mandr_r("dppd", 190, src, dst); 657 DO_imm_mandr_r("dppd", 191, src, dst); 658 DO_imm_mandr_r("dppd", 192, src, dst); 659 DO_imm_mandr_r("dppd", 193, src, dst); 660 DO_imm_mandr_r("dppd", 194, src, dst); 661 DO_imm_mandr_r("dppd", 195, src, dst); 662 DO_imm_mandr_r("dppd", 196, src, dst); 663 DO_imm_mandr_r("dppd", 197, src, dst); 664 DO_imm_mandr_r("dppd", 198, src, dst); 665 DO_imm_mandr_r("dppd", 199, src, dst); 666 DO_imm_mandr_r("dppd", 200, src, dst); 667 DO_imm_mandr_r("dppd", 201, src, dst); 668 DO_imm_mandr_r("dppd", 202, src, dst); 669 DO_imm_mandr_r("dppd", 203, src, dst); 670 DO_imm_mandr_r("dppd", 204, src, dst); 671 DO_imm_mandr_r("dppd", 205, src, dst); 672 DO_imm_mandr_r("dppd", 206, src, dst); 673 DO_imm_mandr_r("dppd", 207, src, dst); 674 DO_imm_mandr_r("dppd", 208, src, dst); 675 DO_imm_mandr_r("dppd", 209, src, dst); 676 DO_imm_mandr_r("dppd", 210, src, dst); 677 DO_imm_mandr_r("dppd", 211, src, dst); 678 
DO_imm_mandr_r("dppd", 212, src, dst); 679 DO_imm_mandr_r("dppd", 213, src, dst); 680 DO_imm_mandr_r("dppd", 214, src, dst); 681 DO_imm_mandr_r("dppd", 215, src, dst); 682 DO_imm_mandr_r("dppd", 216, src, dst); 683 DO_imm_mandr_r("dppd", 217, src, dst); 684 DO_imm_mandr_r("dppd", 218, src, dst); 685 DO_imm_mandr_r("dppd", 219, src, dst); 686 DO_imm_mandr_r("dppd", 220, src, dst); 687 DO_imm_mandr_r("dppd", 221, src, dst); 688 DO_imm_mandr_r("dppd", 222, src, dst); 689 DO_imm_mandr_r("dppd", 223, src, dst); 690 DO_imm_mandr_r("dppd", 224, src, dst); 691 DO_imm_mandr_r("dppd", 225, src, dst); 692 DO_imm_mandr_r("dppd", 226, src, dst); 693 DO_imm_mandr_r("dppd", 227, src, dst); 694 DO_imm_mandr_r("dppd", 228, src, dst); 695 DO_imm_mandr_r("dppd", 229, src, dst); 696 DO_imm_mandr_r("dppd", 230, src, dst); 697 DO_imm_mandr_r("dppd", 231, src, dst); 698 DO_imm_mandr_r("dppd", 232, src, dst); 699 DO_imm_mandr_r("dppd", 233, src, dst); 700 DO_imm_mandr_r("dppd", 234, src, dst); 701 DO_imm_mandr_r("dppd", 235, src, dst); 702 DO_imm_mandr_r("dppd", 236, src, dst); 703 DO_imm_mandr_r("dppd", 237, src, dst); 704 DO_imm_mandr_r("dppd", 238, src, dst); 705 DO_imm_mandr_r("dppd", 239, src, dst); 706 DO_imm_mandr_r("dppd", 240, src, dst); 707 DO_imm_mandr_r("dppd", 241, src, dst); 708 DO_imm_mandr_r("dppd", 242, src, dst); 709 DO_imm_mandr_r("dppd", 243, src, dst); 710 DO_imm_mandr_r("dppd", 244, src, dst); 711 DO_imm_mandr_r("dppd", 245, src, dst); 712 DO_imm_mandr_r("dppd", 246, src, dst); 713 DO_imm_mandr_r("dppd", 247, src, dst); 714 DO_imm_mandr_r("dppd", 248, src, dst); 715 DO_imm_mandr_r("dppd", 249, src, dst); 716 DO_imm_mandr_r("dppd", 250, src, dst); 717 DO_imm_mandr_r("dppd", 251, src, dst); 718 DO_imm_mandr_r("dppd", 252, src, dst); 719 DO_imm_mandr_r("dppd", 253, src, dst); 720 DO_imm_mandr_r("dppd", 254, src, dst); 721 DO_imm_mandr_r("dppd", 255, src, dst); 722 } 723 } 724 725 void test_DPPS ( void ) 726 { 727 V128 src, dst; 728 { 729 *(float*)(&src[0]) = 1.2; 730 
*(float*)(&src[4]) = -3.4; 731 *(float*)(&src[8]) = -6.7; 732 *(float*)(&src[12]) = 8.9; 733 *(float*)(&dst[0]) = -10.11; 734 *(float*)(&dst[4]) = 12.13; 735 *(float*)(&dst[8]) = 14.15; 736 *(float*)(&dst[12]) = -16.17; 737 DO_imm_mandr_r("dpps", 0, src, dst); 738 DO_imm_mandr_r("dpps", 1, src, dst); 739 DO_imm_mandr_r("dpps", 2, src, dst); 740 DO_imm_mandr_r("dpps", 3, src, dst); 741 DO_imm_mandr_r("dpps", 4, src, dst); 742 DO_imm_mandr_r("dpps", 5, src, dst); 743 DO_imm_mandr_r("dpps", 6, src, dst); 744 DO_imm_mandr_r("dpps", 7, src, dst); 745 DO_imm_mandr_r("dpps", 8, src, dst); 746 DO_imm_mandr_r("dpps", 9, src, dst); 747 DO_imm_mandr_r("dpps", 10, src, dst); 748 DO_imm_mandr_r("dpps", 11, src, dst); 749 DO_imm_mandr_r("dpps", 12, src, dst); 750 DO_imm_mandr_r("dpps", 13, src, dst); 751 DO_imm_mandr_r("dpps", 14, src, dst); 752 DO_imm_mandr_r("dpps", 15, src, dst); 753 DO_imm_mandr_r("dpps", 16, src, dst); 754 DO_imm_mandr_r("dpps", 17, src, dst); 755 DO_imm_mandr_r("dpps", 18, src, dst); 756 DO_imm_mandr_r("dpps", 19, src, dst); 757 DO_imm_mandr_r("dpps", 20, src, dst); 758 DO_imm_mandr_r("dpps", 21, src, dst); 759 DO_imm_mandr_r("dpps", 22, src, dst); 760 DO_imm_mandr_r("dpps", 23, src, dst); 761 DO_imm_mandr_r("dpps", 24, src, dst); 762 DO_imm_mandr_r("dpps", 25, src, dst); 763 DO_imm_mandr_r("dpps", 26, src, dst); 764 DO_imm_mandr_r("dpps", 27, src, dst); 765 DO_imm_mandr_r("dpps", 28, src, dst); 766 DO_imm_mandr_r("dpps", 29, src, dst); 767 DO_imm_mandr_r("dpps", 30, src, dst); 768 DO_imm_mandr_r("dpps", 31, src, dst); 769 DO_imm_mandr_r("dpps", 32, src, dst); 770 DO_imm_mandr_r("dpps", 33, src, dst); 771 DO_imm_mandr_r("dpps", 34, src, dst); 772 DO_imm_mandr_r("dpps", 35, src, dst); 773 DO_imm_mandr_r("dpps", 36, src, dst); 774 DO_imm_mandr_r("dpps", 37, src, dst); 775 DO_imm_mandr_r("dpps", 38, src, dst); 776 DO_imm_mandr_r("dpps", 39, src, dst); 777 DO_imm_mandr_r("dpps", 40, src, dst); 778 DO_imm_mandr_r("dpps", 41, src, dst); 779 
DO_imm_mandr_r("dpps", 42, src, dst); 780 DO_imm_mandr_r("dpps", 43, src, dst); 781 DO_imm_mandr_r("dpps", 44, src, dst); 782 DO_imm_mandr_r("dpps", 45, src, dst); 783 DO_imm_mandr_r("dpps", 46, src, dst); 784 DO_imm_mandr_r("dpps", 47, src, dst); 785 DO_imm_mandr_r("dpps", 48, src, dst); 786 DO_imm_mandr_r("dpps", 49, src, dst); 787 DO_imm_mandr_r("dpps", 50, src, dst); 788 DO_imm_mandr_r("dpps", 51, src, dst); 789 DO_imm_mandr_r("dpps", 52, src, dst); 790 DO_imm_mandr_r("dpps", 53, src, dst); 791 DO_imm_mandr_r("dpps", 54, src, dst); 792 DO_imm_mandr_r("dpps", 55, src, dst); 793 DO_imm_mandr_r("dpps", 56, src, dst); 794 DO_imm_mandr_r("dpps", 57, src, dst); 795 DO_imm_mandr_r("dpps", 58, src, dst); 796 DO_imm_mandr_r("dpps", 59, src, dst); 797 DO_imm_mandr_r("dpps", 60, src, dst); 798 DO_imm_mandr_r("dpps", 61, src, dst); 799 DO_imm_mandr_r("dpps", 62, src, dst); 800 DO_imm_mandr_r("dpps", 63, src, dst); 801 DO_imm_mandr_r("dpps", 64, src, dst); 802 DO_imm_mandr_r("dpps", 65, src, dst); 803 DO_imm_mandr_r("dpps", 66, src, dst); 804 DO_imm_mandr_r("dpps", 67, src, dst); 805 DO_imm_mandr_r("dpps", 68, src, dst); 806 DO_imm_mandr_r("dpps", 69, src, dst); 807 DO_imm_mandr_r("dpps", 70, src, dst); 808 DO_imm_mandr_r("dpps", 71, src, dst); 809 DO_imm_mandr_r("dpps", 72, src, dst); 810 DO_imm_mandr_r("dpps", 73, src, dst); 811 DO_imm_mandr_r("dpps", 74, src, dst); 812 DO_imm_mandr_r("dpps", 75, src, dst); 813 DO_imm_mandr_r("dpps", 76, src, dst); 814 DO_imm_mandr_r("dpps", 77, src, dst); 815 DO_imm_mandr_r("dpps", 78, src, dst); 816 DO_imm_mandr_r("dpps", 79, src, dst); 817 DO_imm_mandr_r("dpps", 80, src, dst); 818 DO_imm_mandr_r("dpps", 81, src, dst); 819 DO_imm_mandr_r("dpps", 82, src, dst); 820 DO_imm_mandr_r("dpps", 83, src, dst); 821 DO_imm_mandr_r("dpps", 84, src, dst); 822 DO_imm_mandr_r("dpps", 85, src, dst); 823 DO_imm_mandr_r("dpps", 86, src, dst); 824 DO_imm_mandr_r("dpps", 87, src, dst); 825 DO_imm_mandr_r("dpps", 88, src, dst); 826 DO_imm_mandr_r("dpps", 
89, src, dst); 827 DO_imm_mandr_r("dpps", 90, src, dst); 828 DO_imm_mandr_r("dpps", 91, src, dst); 829 DO_imm_mandr_r("dpps", 92, src, dst); 830 DO_imm_mandr_r("dpps", 93, src, dst); 831 DO_imm_mandr_r("dpps", 94, src, dst); 832 DO_imm_mandr_r("dpps", 95, src, dst); 833 DO_imm_mandr_r("dpps", 96, src, dst); 834 DO_imm_mandr_r("dpps", 97, src, dst); 835 DO_imm_mandr_r("dpps", 98, src, dst); 836 DO_imm_mandr_r("dpps", 99, src, dst); 837 DO_imm_mandr_r("dpps", 100, src, dst); 838 DO_imm_mandr_r("dpps", 101, src, dst); 839 DO_imm_mandr_r("dpps", 102, src, dst); 840 DO_imm_mandr_r("dpps", 103, src, dst); 841 DO_imm_mandr_r("dpps", 104, src, dst); 842 DO_imm_mandr_r("dpps", 105, src, dst); 843 DO_imm_mandr_r("dpps", 106, src, dst); 844 DO_imm_mandr_r("dpps", 107, src, dst); 845 DO_imm_mandr_r("dpps", 108, src, dst); 846 DO_imm_mandr_r("dpps", 109, src, dst); 847 DO_imm_mandr_r("dpps", 110, src, dst); 848 DO_imm_mandr_r("dpps", 111, src, dst); 849 DO_imm_mandr_r("dpps", 112, src, dst); 850 DO_imm_mandr_r("dpps", 113, src, dst); 851 DO_imm_mandr_r("dpps", 114, src, dst); 852 DO_imm_mandr_r("dpps", 115, src, dst); 853 DO_imm_mandr_r("dpps", 116, src, dst); 854 DO_imm_mandr_r("dpps", 117, src, dst); 855 DO_imm_mandr_r("dpps", 118, src, dst); 856 DO_imm_mandr_r("dpps", 119, src, dst); 857 DO_imm_mandr_r("dpps", 120, src, dst); 858 DO_imm_mandr_r("dpps", 121, src, dst); 859 DO_imm_mandr_r("dpps", 122, src, dst); 860 DO_imm_mandr_r("dpps", 123, src, dst); 861 DO_imm_mandr_r("dpps", 124, src, dst); 862 DO_imm_mandr_r("dpps", 125, src, dst); 863 DO_imm_mandr_r("dpps", 126, src, dst); 864 DO_imm_mandr_r("dpps", 127, src, dst); 865 DO_imm_mandr_r("dpps", 128, src, dst); 866 DO_imm_mandr_r("dpps", 129, src, dst); 867 DO_imm_mandr_r("dpps", 130, src, dst); 868 DO_imm_mandr_r("dpps", 131, src, dst); 869 DO_imm_mandr_r("dpps", 132, src, dst); 870 DO_imm_mandr_r("dpps", 133, src, dst); 871 DO_imm_mandr_r("dpps", 134, src, dst); 872 DO_imm_mandr_r("dpps", 135, src, dst); 873 
DO_imm_mandr_r("dpps", 136, src, dst); 874 DO_imm_mandr_r("dpps", 137, src, dst); 875 DO_imm_mandr_r("dpps", 138, src, dst); 876 DO_imm_mandr_r("dpps", 139, src, dst); 877 DO_imm_mandr_r("dpps", 140, src, dst); 878 DO_imm_mandr_r("dpps", 141, src, dst); 879 DO_imm_mandr_r("dpps", 142, src, dst); 880 DO_imm_mandr_r("dpps", 143, src, dst); 881 DO_imm_mandr_r("dpps", 144, src, dst); 882 DO_imm_mandr_r("dpps", 145, src, dst); 883 DO_imm_mandr_r("dpps", 146, src, dst); 884 DO_imm_mandr_r("dpps", 147, src, dst); 885 DO_imm_mandr_r("dpps", 148, src, dst); 886 DO_imm_mandr_r("dpps", 149, src, dst); 887 DO_imm_mandr_r("dpps", 150, src, dst); 888 DO_imm_mandr_r("dpps", 151, src, dst); 889 DO_imm_mandr_r("dpps", 152, src, dst); 890 DO_imm_mandr_r("dpps", 153, src, dst); 891 DO_imm_mandr_r("dpps", 154, src, dst); 892 DO_imm_mandr_r("dpps", 155, src, dst); 893 DO_imm_mandr_r("dpps", 156, src, dst); 894 DO_imm_mandr_r("dpps", 157, src, dst); 895 DO_imm_mandr_r("dpps", 158, src, dst); 896 DO_imm_mandr_r("dpps", 159, src, dst); 897 DO_imm_mandr_r("dpps", 160, src, dst); 898 DO_imm_mandr_r("dpps", 161, src, dst); 899 DO_imm_mandr_r("dpps", 162, src, dst); 900 DO_imm_mandr_r("dpps", 163, src, dst); 901 DO_imm_mandr_r("dpps", 164, src, dst); 902 DO_imm_mandr_r("dpps", 165, src, dst); 903 DO_imm_mandr_r("dpps", 166, src, dst); 904 DO_imm_mandr_r("dpps", 167, src, dst); 905 DO_imm_mandr_r("dpps", 168, src, dst); 906 DO_imm_mandr_r("dpps", 169, src, dst); 907 DO_imm_mandr_r("dpps", 170, src, dst); 908 DO_imm_mandr_r("dpps", 171, src, dst); 909 DO_imm_mandr_r("dpps", 172, src, dst); 910 DO_imm_mandr_r("dpps", 173, src, dst); 911 DO_imm_mandr_r("dpps", 174, src, dst); 912 DO_imm_mandr_r("dpps", 175, src, dst); 913 DO_imm_mandr_r("dpps", 176, src, dst); 914 DO_imm_mandr_r("dpps", 177, src, dst); 915 DO_imm_mandr_r("dpps", 178, src, dst); 916 DO_imm_mandr_r("dpps", 179, src, dst); 917 DO_imm_mandr_r("dpps", 180, src, dst); 918 DO_imm_mandr_r("dpps", 181, src, dst); 919 
DO_imm_mandr_r("dpps", 182, src, dst); 920 DO_imm_mandr_r("dpps", 183, src, dst); 921 DO_imm_mandr_r("dpps", 184, src, dst); 922 DO_imm_mandr_r("dpps", 185, src, dst); 923 DO_imm_mandr_r("dpps", 186, src, dst); 924 DO_imm_mandr_r("dpps", 187, src, dst); 925 DO_imm_mandr_r("dpps", 188, src, dst); 926 DO_imm_mandr_r("dpps", 189, src, dst); 927 DO_imm_mandr_r("dpps", 190, src, dst); 928 DO_imm_mandr_r("dpps", 191, src, dst); 929 DO_imm_mandr_r("dpps", 192, src, dst); 930 DO_imm_mandr_r("dpps", 193, src, dst); 931 DO_imm_mandr_r("dpps", 194, src, dst); 932 DO_imm_mandr_r("dpps", 195, src, dst); 933 DO_imm_mandr_r("dpps", 196, src, dst); 934 DO_imm_mandr_r("dpps", 197, src, dst); 935 DO_imm_mandr_r("dpps", 198, src, dst); 936 DO_imm_mandr_r("dpps", 199, src, dst); 937 DO_imm_mandr_r("dpps", 200, src, dst); 938 DO_imm_mandr_r("dpps", 201, src, dst); 939 DO_imm_mandr_r("dpps", 202, src, dst); 940 DO_imm_mandr_r("dpps", 203, src, dst); 941 DO_imm_mandr_r("dpps", 204, src, dst); 942 DO_imm_mandr_r("dpps", 205, src, dst); 943 DO_imm_mandr_r("dpps", 206, src, dst); 944 DO_imm_mandr_r("dpps", 207, src, dst); 945 DO_imm_mandr_r("dpps", 208, src, dst); 946 DO_imm_mandr_r("dpps", 209, src, dst); 947 DO_imm_mandr_r("dpps", 210, src, dst); 948 DO_imm_mandr_r("dpps", 211, src, dst); 949 DO_imm_mandr_r("dpps", 212, src, dst); 950 DO_imm_mandr_r("dpps", 213, src, dst); 951 DO_imm_mandr_r("dpps", 214, src, dst); 952 DO_imm_mandr_r("dpps", 215, src, dst); 953 DO_imm_mandr_r("dpps", 216, src, dst); 954 DO_imm_mandr_r("dpps", 217, src, dst); 955 DO_imm_mandr_r("dpps", 218, src, dst); 956 DO_imm_mandr_r("dpps", 219, src, dst); 957 DO_imm_mandr_r("dpps", 220, src, dst); 958 DO_imm_mandr_r("dpps", 221, src, dst); 959 DO_imm_mandr_r("dpps", 222, src, dst); 960 DO_imm_mandr_r("dpps", 223, src, dst); 961 DO_imm_mandr_r("dpps", 224, src, dst); 962 DO_imm_mandr_r("dpps", 225, src, dst); 963 DO_imm_mandr_r("dpps", 226, src, dst); 964 DO_imm_mandr_r("dpps", 227, src, dst); 965 
DO_imm_mandr_r("dpps", 228, src, dst); 966 DO_imm_mandr_r("dpps", 229, src, dst); 967 DO_imm_mandr_r("dpps", 230, src, dst); 968 DO_imm_mandr_r("dpps", 231, src, dst); 969 DO_imm_mandr_r("dpps", 232, src, dst); 970 DO_imm_mandr_r("dpps", 233, src, dst); 971 DO_imm_mandr_r("dpps", 234, src, dst); 972 DO_imm_mandr_r("dpps", 235, src, dst); 973 DO_imm_mandr_r("dpps", 236, src, dst); 974 DO_imm_mandr_r("dpps", 237, src, dst); 975 DO_imm_mandr_r("dpps", 238, src, dst); 976 DO_imm_mandr_r("dpps", 239, src, dst); 977 DO_imm_mandr_r("dpps", 240, src, dst); 978 DO_imm_mandr_r("dpps", 241, src, dst); 979 DO_imm_mandr_r("dpps", 242, src, dst); 980 DO_imm_mandr_r("dpps", 243, src, dst); 981 DO_imm_mandr_r("dpps", 244, src, dst); 982 DO_imm_mandr_r("dpps", 245, src, dst); 983 DO_imm_mandr_r("dpps", 246, src, dst); 984 DO_imm_mandr_r("dpps", 247, src, dst); 985 DO_imm_mandr_r("dpps", 248, src, dst); 986 DO_imm_mandr_r("dpps", 249, src, dst); 987 DO_imm_mandr_r("dpps", 250, src, dst); 988 DO_imm_mandr_r("dpps", 251, src, dst); 989 DO_imm_mandr_r("dpps", 252, src, dst); 990 DO_imm_mandr_r("dpps", 253, src, dst); 991 DO_imm_mandr_r("dpps", 254, src, dst); 992 DO_imm_mandr_r("dpps", 255, src, dst); 993 } 994 } 995 996 void test_INSERTPS ( void ) 997 { 998 V128 src, dst; 999 { 1000 *(float*)(&src[0]) = 1.2; 1001 *(float*)(&src[4]) = -3.4; 1002 *(float*)(&src[8]) = -6.7; 1003 *(float*)(&src[12]) = 8.9; 1004 *(float*)(&dst[0]) = -10.11; 1005 *(float*)(&dst[4]) = 12.13; 1006 *(float*)(&dst[8]) = 14.15; 1007 *(float*)(&dst[12]) = -16.17; 1008 DO_imm_mandr_r("insertps", 0, src, dst); 1009 DO_imm_mandr_r("insertps", 1, src, dst); 1010 DO_imm_mandr_r("insertps", 2, src, dst); 1011 DO_imm_mandr_r("insertps", 3, src, dst); 1012 DO_imm_mandr_r("insertps", 4, src, dst); 1013 DO_imm_mandr_r("insertps", 5, src, dst); 1014 DO_imm_mandr_r("insertps", 6, src, dst); 1015 DO_imm_mandr_r("insertps", 7, src, dst); 1016 DO_imm_mandr_r("insertps", 8, src, dst); 1017 DO_imm_mandr_r("insertps", 9, src, 
dst); 1018 DO_imm_mandr_r("insertps", 10, src, dst); 1019 DO_imm_mandr_r("insertps", 11, src, dst); 1020 DO_imm_mandr_r("insertps", 12, src, dst); 1021 DO_imm_mandr_r("insertps", 13, src, dst); 1022 DO_imm_mandr_r("insertps", 14, src, dst); 1023 DO_imm_mandr_r("insertps", 15, src, dst); 1024 DO_imm_mandr_r("insertps", 16, src, dst); 1025 DO_imm_mandr_r("insertps", 17, src, dst); 1026 DO_imm_mandr_r("insertps", 18, src, dst); 1027 DO_imm_mandr_r("insertps", 19, src, dst); 1028 DO_imm_mandr_r("insertps", 20, src, dst); 1029 DO_imm_mandr_r("insertps", 21, src, dst); 1030 DO_imm_mandr_r("insertps", 22, src, dst); 1031 DO_imm_mandr_r("insertps", 23, src, dst); 1032 DO_imm_mandr_r("insertps", 24, src, dst); 1033 DO_imm_mandr_r("insertps", 25, src, dst); 1034 DO_imm_mandr_r("insertps", 26, src, dst); 1035 DO_imm_mandr_r("insertps", 27, src, dst); 1036 DO_imm_mandr_r("insertps", 28, src, dst); 1037 DO_imm_mandr_r("insertps", 29, src, dst); 1038 DO_imm_mandr_r("insertps", 30, src, dst); 1039 DO_imm_mandr_r("insertps", 31, src, dst); 1040 DO_imm_mandr_r("insertps", 32, src, dst); 1041 DO_imm_mandr_r("insertps", 33, src, dst); 1042 DO_imm_mandr_r("insertps", 34, src, dst); 1043 DO_imm_mandr_r("insertps", 35, src, dst); 1044 DO_imm_mandr_r("insertps", 36, src, dst); 1045 DO_imm_mandr_r("insertps", 37, src, dst); 1046 DO_imm_mandr_r("insertps", 38, src, dst); 1047 DO_imm_mandr_r("insertps", 39, src, dst); 1048 DO_imm_mandr_r("insertps", 40, src, dst); 1049 DO_imm_mandr_r("insertps", 41, src, dst); 1050 DO_imm_mandr_r("insertps", 42, src, dst); 1051 DO_imm_mandr_r("insertps", 43, src, dst); 1052 DO_imm_mandr_r("insertps", 44, src, dst); 1053 DO_imm_mandr_r("insertps", 45, src, dst); 1054 DO_imm_mandr_r("insertps", 46, src, dst); 1055 DO_imm_mandr_r("insertps", 47, src, dst); 1056 DO_imm_mandr_r("insertps", 48, src, dst); 1057 DO_imm_mandr_r("insertps", 49, src, dst); 1058 DO_imm_mandr_r("insertps", 50, src, dst); 1059 DO_imm_mandr_r("insertps", 51, src, dst); 1060 
DO_imm_mandr_r("insertps", 52, src, dst); 1061 DO_imm_mandr_r("insertps", 53, src, dst); 1062 DO_imm_mandr_r("insertps", 54, src, dst); 1063 DO_imm_mandr_r("insertps", 55, src, dst); 1064 DO_imm_mandr_r("insertps", 56, src, dst); 1065 DO_imm_mandr_r("insertps", 57, src, dst); 1066 DO_imm_mandr_r("insertps", 58, src, dst); 1067 DO_imm_mandr_r("insertps", 59, src, dst); 1068 DO_imm_mandr_r("insertps", 60, src, dst); 1069 DO_imm_mandr_r("insertps", 61, src, dst); 1070 DO_imm_mandr_r("insertps", 62, src, dst); 1071 DO_imm_mandr_r("insertps", 63, src, dst); 1072 DO_imm_mandr_r("insertps", 64, src, dst); 1073 DO_imm_mandr_r("insertps", 65, src, dst); 1074 DO_imm_mandr_r("insertps", 66, src, dst); 1075 DO_imm_mandr_r("insertps", 67, src, dst); 1076 DO_imm_mandr_r("insertps", 68, src, dst); 1077 DO_imm_mandr_r("insertps", 69, src, dst); 1078 DO_imm_mandr_r("insertps", 70, src, dst); 1079 DO_imm_mandr_r("insertps", 71, src, dst); 1080 DO_imm_mandr_r("insertps", 72, src, dst); 1081 DO_imm_mandr_r("insertps", 73, src, dst); 1082 DO_imm_mandr_r("insertps", 74, src, dst); 1083 DO_imm_mandr_r("insertps", 75, src, dst); 1084 DO_imm_mandr_r("insertps", 76, src, dst); 1085 DO_imm_mandr_r("insertps", 77, src, dst); 1086 DO_imm_mandr_r("insertps", 78, src, dst); 1087 DO_imm_mandr_r("insertps", 79, src, dst); 1088 DO_imm_mandr_r("insertps", 80, src, dst); 1089 DO_imm_mandr_r("insertps", 81, src, dst); 1090 DO_imm_mandr_r("insertps", 82, src, dst); 1091 DO_imm_mandr_r("insertps", 83, src, dst); 1092 DO_imm_mandr_r("insertps", 84, src, dst); 1093 DO_imm_mandr_r("insertps", 85, src, dst); 1094 DO_imm_mandr_r("insertps", 86, src, dst); 1095 DO_imm_mandr_r("insertps", 87, src, dst); 1096 DO_imm_mandr_r("insertps", 88, src, dst); 1097 DO_imm_mandr_r("insertps", 89, src, dst); 1098 DO_imm_mandr_r("insertps", 90, src, dst); 1099 DO_imm_mandr_r("insertps", 91, src, dst); 1100 DO_imm_mandr_r("insertps", 92, src, dst); 1101 DO_imm_mandr_r("insertps", 93, src, dst); 1102 
DO_imm_mandr_r("insertps", 94, src, dst); 1103 DO_imm_mandr_r("insertps", 95, src, dst); 1104 DO_imm_mandr_r("insertps", 96, src, dst); 1105 DO_imm_mandr_r("insertps", 97, src, dst); 1106 DO_imm_mandr_r("insertps", 98, src, dst); 1107 DO_imm_mandr_r("insertps", 99, src, dst); 1108 DO_imm_mandr_r("insertps", 100, src, dst); 1109 DO_imm_mandr_r("insertps", 101, src, dst); 1110 DO_imm_mandr_r("insertps", 102, src, dst); 1111 DO_imm_mandr_r("insertps", 103, src, dst); 1112 DO_imm_mandr_r("insertps", 104, src, dst); 1113 DO_imm_mandr_r("insertps", 105, src, dst); 1114 DO_imm_mandr_r("insertps", 106, src, dst); 1115 DO_imm_mandr_r("insertps", 107, src, dst); 1116 DO_imm_mandr_r("insertps", 108, src, dst); 1117 DO_imm_mandr_r("insertps", 109, src, dst); 1118 DO_imm_mandr_r("insertps", 110, src, dst); 1119 DO_imm_mandr_r("insertps", 111, src, dst); 1120 DO_imm_mandr_r("insertps", 112, src, dst); 1121 DO_imm_mandr_r("insertps", 113, src, dst); 1122 DO_imm_mandr_r("insertps", 114, src, dst); 1123 DO_imm_mandr_r("insertps", 115, src, dst); 1124 DO_imm_mandr_r("insertps", 116, src, dst); 1125 DO_imm_mandr_r("insertps", 117, src, dst); 1126 DO_imm_mandr_r("insertps", 118, src, dst); 1127 DO_imm_mandr_r("insertps", 119, src, dst); 1128 DO_imm_mandr_r("insertps", 120, src, dst); 1129 DO_imm_mandr_r("insertps", 121, src, dst); 1130 DO_imm_mandr_r("insertps", 122, src, dst); 1131 DO_imm_mandr_r("insertps", 123, src, dst); 1132 DO_imm_mandr_r("insertps", 124, src, dst); 1133 DO_imm_mandr_r("insertps", 125, src, dst); 1134 DO_imm_mandr_r("insertps", 126, src, dst); 1135 DO_imm_mandr_r("insertps", 127, src, dst); 1136 DO_imm_mandr_r("insertps", 128, src, dst); 1137 DO_imm_mandr_r("insertps", 129, src, dst); 1138 DO_imm_mandr_r("insertps", 130, src, dst); 1139 DO_imm_mandr_r("insertps", 131, src, dst); 1140 DO_imm_mandr_r("insertps", 132, src, dst); 1141 DO_imm_mandr_r("insertps", 133, src, dst); 1142 DO_imm_mandr_r("insertps", 134, src, dst); 1143 DO_imm_mandr_r("insertps", 135, src, 
dst); 1144 DO_imm_mandr_r("insertps", 136, src, dst); 1145 DO_imm_mandr_r("insertps", 137, src, dst); 1146 DO_imm_mandr_r("insertps", 138, src, dst); 1147 DO_imm_mandr_r("insertps", 139, src, dst); 1148 DO_imm_mandr_r("insertps", 140, src, dst); 1149 DO_imm_mandr_r("insertps", 141, src, dst); 1150 DO_imm_mandr_r("insertps", 142, src, dst); 1151 DO_imm_mandr_r("insertps", 143, src, dst); 1152 DO_imm_mandr_r("insertps", 144, src, dst); 1153 DO_imm_mandr_r("insertps", 145, src, dst); 1154 DO_imm_mandr_r("insertps", 146, src, dst); 1155 DO_imm_mandr_r("insertps", 147, src, dst); 1156 DO_imm_mandr_r("insertps", 148, src, dst); 1157 DO_imm_mandr_r("insertps", 149, src, dst); 1158 DO_imm_mandr_r("insertps", 150, src, dst); 1159 DO_imm_mandr_r("insertps", 151, src, dst); 1160 DO_imm_mandr_r("insertps", 152, src, dst); 1161 DO_imm_mandr_r("insertps", 153, src, dst); 1162 DO_imm_mandr_r("insertps", 154, src, dst); 1163 DO_imm_mandr_r("insertps", 155, src, dst); 1164 DO_imm_mandr_r("insertps", 156, src, dst); 1165 DO_imm_mandr_r("insertps", 157, src, dst); 1166 DO_imm_mandr_r("insertps", 158, src, dst); 1167 DO_imm_mandr_r("insertps", 159, src, dst); 1168 DO_imm_mandr_r("insertps", 160, src, dst); 1169 DO_imm_mandr_r("insertps", 161, src, dst); 1170 DO_imm_mandr_r("insertps", 162, src, dst); 1171 DO_imm_mandr_r("insertps", 163, src, dst); 1172 DO_imm_mandr_r("insertps", 164, src, dst); 1173 DO_imm_mandr_r("insertps", 165, src, dst); 1174 DO_imm_mandr_r("insertps", 166, src, dst); 1175 DO_imm_mandr_r("insertps", 167, src, dst); 1176 DO_imm_mandr_r("insertps", 168, src, dst); 1177 DO_imm_mandr_r("insertps", 169, src, dst); 1178 DO_imm_mandr_r("insertps", 170, src, dst); 1179 DO_imm_mandr_r("insertps", 171, src, dst); 1180 DO_imm_mandr_r("insertps", 172, src, dst); 1181 DO_imm_mandr_r("insertps", 173, src, dst); 1182 DO_imm_mandr_r("insertps", 174, src, dst); 1183 DO_imm_mandr_r("insertps", 175, src, dst); 1184 DO_imm_mandr_r("insertps", 176, src, dst); 1185 
DO_imm_mandr_r("insertps", 177, src, dst); 1186 DO_imm_mandr_r("insertps", 178, src, dst); 1187 DO_imm_mandr_r("insertps", 179, src, dst); 1188 DO_imm_mandr_r("insertps", 180, src, dst); 1189 DO_imm_mandr_r("insertps", 181, src, dst); 1190 DO_imm_mandr_r("insertps", 182, src, dst); 1191 DO_imm_mandr_r("insertps", 183, src, dst); 1192 DO_imm_mandr_r("insertps", 184, src, dst); 1193 DO_imm_mandr_r("insertps", 185, src, dst); 1194 DO_imm_mandr_r("insertps", 186, src, dst); 1195 DO_imm_mandr_r("insertps", 187, src, dst); 1196 DO_imm_mandr_r("insertps", 188, src, dst); 1197 DO_imm_mandr_r("insertps", 189, src, dst); 1198 DO_imm_mandr_r("insertps", 190, src, dst); 1199 DO_imm_mandr_r("insertps", 191, src, dst); 1200 DO_imm_mandr_r("insertps", 192, src, dst); 1201 DO_imm_mandr_r("insertps", 193, src, dst); 1202 DO_imm_mandr_r("insertps", 194, src, dst); 1203 DO_imm_mandr_r("insertps", 195, src, dst); 1204 DO_imm_mandr_r("insertps", 196, src, dst); 1205 DO_imm_mandr_r("insertps", 197, src, dst); 1206 DO_imm_mandr_r("insertps", 198, src, dst); 1207 DO_imm_mandr_r("insertps", 199, src, dst); 1208 DO_imm_mandr_r("insertps", 200, src, dst); 1209 DO_imm_mandr_r("insertps", 201, src, dst); 1210 DO_imm_mandr_r("insertps", 202, src, dst); 1211 DO_imm_mandr_r("insertps", 203, src, dst); 1212 DO_imm_mandr_r("insertps", 204, src, dst); 1213 DO_imm_mandr_r("insertps", 205, src, dst); 1214 DO_imm_mandr_r("insertps", 206, src, dst); 1215 DO_imm_mandr_r("insertps", 207, src, dst); 1216 DO_imm_mandr_r("insertps", 208, src, dst); 1217 DO_imm_mandr_r("insertps", 209, src, dst); 1218 DO_imm_mandr_r("insertps", 210, src, dst); 1219 DO_imm_mandr_r("insertps", 211, src, dst); 1220 DO_imm_mandr_r("insertps", 212, src, dst); 1221 DO_imm_mandr_r("insertps", 213, src, dst); 1222 DO_imm_mandr_r("insertps", 214, src, dst); 1223 DO_imm_mandr_r("insertps", 215, src, dst); 1224 DO_imm_mandr_r("insertps", 216, src, dst); 1225 DO_imm_mandr_r("insertps", 217, src, dst); 1226 DO_imm_mandr_r("insertps", 218, 
src, dst); 1227 DO_imm_mandr_r("insertps", 219, src, dst); 1228 DO_imm_mandr_r("insertps", 220, src, dst); 1229 DO_imm_mandr_r("insertps", 221, src, dst); 1230 DO_imm_mandr_r("insertps", 222, src, dst); 1231 DO_imm_mandr_r("insertps", 223, src, dst); 1232 DO_imm_mandr_r("insertps", 224, src, dst); 1233 DO_imm_mandr_r("insertps", 225, src, dst); 1234 DO_imm_mandr_r("insertps", 226, src, dst); 1235 DO_imm_mandr_r("insertps", 227, src, dst); 1236 DO_imm_mandr_r("insertps", 228, src, dst); 1237 DO_imm_mandr_r("insertps", 229, src, dst); 1238 DO_imm_mandr_r("insertps", 230, src, dst); 1239 DO_imm_mandr_r("insertps", 231, src, dst); 1240 DO_imm_mandr_r("insertps", 232, src, dst); 1241 DO_imm_mandr_r("insertps", 233, src, dst); 1242 DO_imm_mandr_r("insertps", 234, src, dst); 1243 DO_imm_mandr_r("insertps", 235, src, dst); 1244 DO_imm_mandr_r("insertps", 236, src, dst); 1245 DO_imm_mandr_r("insertps", 237, src, dst); 1246 DO_imm_mandr_r("insertps", 238, src, dst); 1247 DO_imm_mandr_r("insertps", 239, src, dst); 1248 DO_imm_mandr_r("insertps", 240, src, dst); 1249 DO_imm_mandr_r("insertps", 241, src, dst); 1250 DO_imm_mandr_r("insertps", 242, src, dst); 1251 DO_imm_mandr_r("insertps", 243, src, dst); 1252 DO_imm_mandr_r("insertps", 244, src, dst); 1253 DO_imm_mandr_r("insertps", 245, src, dst); 1254 DO_imm_mandr_r("insertps", 246, src, dst); 1255 DO_imm_mandr_r("insertps", 247, src, dst); 1256 DO_imm_mandr_r("insertps", 248, src, dst); 1257 DO_imm_mandr_r("insertps", 249, src, dst); 1258 DO_imm_mandr_r("insertps", 250, src, dst); 1259 DO_imm_mandr_r("insertps", 251, src, dst); 1260 DO_imm_mandr_r("insertps", 252, src, dst); 1261 DO_imm_mandr_r("insertps", 253, src, dst); 1262 DO_imm_mandr_r("insertps", 254, src, dst); 1263 DO_imm_mandr_r("insertps", 255, src, dst); 1264 } 1265 } 1266 1267 void test_MPSADBW ( void ) 1268 { 1269 V128 src, dst; 1270 Int i; 1271 for (i = 0; i < 10; i++) { 1272 randV128(&src); 1273 randV128(&dst); 1274 DO_imm_mandr_r("mpsadbw", 0, src, dst); 
1275 DO_imm_mandr_r("mpsadbw", 1, src, dst); 1276 DO_imm_mandr_r("mpsadbw", 2, src, dst); 1277 DO_imm_mandr_r("mpsadbw", 3, src, dst); 1278 DO_imm_mandr_r("mpsadbw", 4, src, dst); 1279 DO_imm_mandr_r("mpsadbw", 5, src, dst); 1280 DO_imm_mandr_r("mpsadbw", 6, src, dst); 1281 DO_imm_mandr_r("mpsadbw", 7, src, dst); 1282 } 1283 } 1284 1285 void test_PACKUSDW ( void ) 1286 { 1287 V128 src, dst; 1288 Int i; 1289 for (i = 0; i < 10; i++) { 1290 if (i < 9) { 1291 randV128(&src); 1292 randV128(&dst); 1293 } else { 1294 memset(&src, 0, sizeof(src)); 1295 memset(&dst, 0, sizeof(src)); 1296 src[0] = 0x11; src[1] = 0x22; 1297 src[4] = 0x33; src[5] = 0x44; 1298 src[8] = 0x55; src[9] = 0x66; 1299 src[12] = 0x77; src[13] = 0x88; 1300 dst[0] = 0xaa; dst[1] = 0xbb; 1301 dst[4] = 0xcc; dst[5] = 0xdd; 1302 dst[8] = 0xee; dst[9] = 0xff; 1303 dst[12] = 0xa1; dst[13] = 0xb2; 1304 } 1305 DO_mandr_r("packusdw", src, dst); 1306 } 1307 } 1308 1309 void test_PBLENDW ( void ) 1310 { 1311 V128 src, dst; 1312 randV128(&src); 1313 randV128(&dst); 1314 { 1315 DO_imm_mandr_r("pblendw", 0, src, dst); 1316 DO_imm_mandr_r("pblendw", 1, src, dst); 1317 DO_imm_mandr_r("pblendw", 2, src, dst); 1318 DO_imm_mandr_r("pblendw", 3, src, dst); 1319 DO_imm_mandr_r("pblendw", 4, src, dst); 1320 DO_imm_mandr_r("pblendw", 5, src, dst); 1321 DO_imm_mandr_r("pblendw", 6, src, dst); 1322 DO_imm_mandr_r("pblendw", 7, src, dst); 1323 DO_imm_mandr_r("pblendw", 8, src, dst); 1324 DO_imm_mandr_r("pblendw", 9, src, dst); 1325 DO_imm_mandr_r("pblendw", 10, src, dst); 1326 DO_imm_mandr_r("pblendw", 11, src, dst); 1327 DO_imm_mandr_r("pblendw", 12, src, dst); 1328 DO_imm_mandr_r("pblendw", 13, src, dst); 1329 DO_imm_mandr_r("pblendw", 14, src, dst); 1330 DO_imm_mandr_r("pblendw", 15, src, dst); 1331 DO_imm_mandr_r("pblendw", 16, src, dst); 1332 DO_imm_mandr_r("pblendw", 17, src, dst); 1333 DO_imm_mandr_r("pblendw", 18, src, dst); 1334 DO_imm_mandr_r("pblendw", 19, src, dst); 1335 DO_imm_mandr_r("pblendw", 20, src, dst); 1336 
DO_imm_mandr_r("pblendw", 21, src, dst); 1337 DO_imm_mandr_r("pblendw", 22, src, dst); 1338 DO_imm_mandr_r("pblendw", 23, src, dst); 1339 DO_imm_mandr_r("pblendw", 24, src, dst); 1340 DO_imm_mandr_r("pblendw", 25, src, dst); 1341 DO_imm_mandr_r("pblendw", 26, src, dst); 1342 DO_imm_mandr_r("pblendw", 27, src, dst); 1343 DO_imm_mandr_r("pblendw", 28, src, dst); 1344 DO_imm_mandr_r("pblendw", 29, src, dst); 1345 DO_imm_mandr_r("pblendw", 30, src, dst); 1346 DO_imm_mandr_r("pblendw", 31, src, dst); 1347 DO_imm_mandr_r("pblendw", 32, src, dst); 1348 DO_imm_mandr_r("pblendw", 33, src, dst); 1349 DO_imm_mandr_r("pblendw", 34, src, dst); 1350 DO_imm_mandr_r("pblendw", 35, src, dst); 1351 DO_imm_mandr_r("pblendw", 36, src, dst); 1352 DO_imm_mandr_r("pblendw", 37, src, dst); 1353 DO_imm_mandr_r("pblendw", 38, src, dst); 1354 DO_imm_mandr_r("pblendw", 39, src, dst); 1355 DO_imm_mandr_r("pblendw", 40, src, dst); 1356 DO_imm_mandr_r("pblendw", 41, src, dst); 1357 DO_imm_mandr_r("pblendw", 42, src, dst); 1358 DO_imm_mandr_r("pblendw", 43, src, dst); 1359 DO_imm_mandr_r("pblendw", 44, src, dst); 1360 DO_imm_mandr_r("pblendw", 45, src, dst); 1361 DO_imm_mandr_r("pblendw", 46, src, dst); 1362 DO_imm_mandr_r("pblendw", 47, src, dst); 1363 DO_imm_mandr_r("pblendw", 48, src, dst); 1364 DO_imm_mandr_r("pblendw", 49, src, dst); 1365 DO_imm_mandr_r("pblendw", 50, src, dst); 1366 DO_imm_mandr_r("pblendw", 51, src, dst); 1367 DO_imm_mandr_r("pblendw", 52, src, dst); 1368 DO_imm_mandr_r("pblendw", 53, src, dst); 1369 DO_imm_mandr_r("pblendw", 54, src, dst); 1370 DO_imm_mandr_r("pblendw", 55, src, dst); 1371 DO_imm_mandr_r("pblendw", 56, src, dst); 1372 DO_imm_mandr_r("pblendw", 57, src, dst); 1373 DO_imm_mandr_r("pblendw", 58, src, dst); 1374 DO_imm_mandr_r("pblendw", 59, src, dst); 1375 DO_imm_mandr_r("pblendw", 60, src, dst); 1376 DO_imm_mandr_r("pblendw", 61, src, dst); 1377 DO_imm_mandr_r("pblendw", 62, src, dst); 1378 DO_imm_mandr_r("pblendw", 63, src, dst); 1379 
DO_imm_mandr_r("pblendw", 64, src, dst); 1380 DO_imm_mandr_r("pblendw", 65, src, dst); 1381 DO_imm_mandr_r("pblendw", 66, src, dst); 1382 DO_imm_mandr_r("pblendw", 67, src, dst); 1383 DO_imm_mandr_r("pblendw", 68, src, dst); 1384 DO_imm_mandr_r("pblendw", 69, src, dst); 1385 DO_imm_mandr_r("pblendw", 70, src, dst); 1386 DO_imm_mandr_r("pblendw", 71, src, dst); 1387 DO_imm_mandr_r("pblendw", 72, src, dst); 1388 DO_imm_mandr_r("pblendw", 73, src, dst); 1389 DO_imm_mandr_r("pblendw", 74, src, dst); 1390 DO_imm_mandr_r("pblendw", 75, src, dst); 1391 DO_imm_mandr_r("pblendw", 76, src, dst); 1392 DO_imm_mandr_r("pblendw", 77, src, dst); 1393 DO_imm_mandr_r("pblendw", 78, src, dst); 1394 DO_imm_mandr_r("pblendw", 79, src, dst); 1395 DO_imm_mandr_r("pblendw", 80, src, dst); 1396 DO_imm_mandr_r("pblendw", 81, src, dst); 1397 DO_imm_mandr_r("pblendw", 82, src, dst); 1398 DO_imm_mandr_r("pblendw", 83, src, dst); 1399 DO_imm_mandr_r("pblendw", 84, src, dst); 1400 DO_imm_mandr_r("pblendw", 85, src, dst); 1401 DO_imm_mandr_r("pblendw", 86, src, dst); 1402 DO_imm_mandr_r("pblendw", 87, src, dst); 1403 DO_imm_mandr_r("pblendw", 88, src, dst); 1404 DO_imm_mandr_r("pblendw", 89, src, dst); 1405 DO_imm_mandr_r("pblendw", 90, src, dst); 1406 DO_imm_mandr_r("pblendw", 91, src, dst); 1407 DO_imm_mandr_r("pblendw", 92, src, dst); 1408 DO_imm_mandr_r("pblendw", 93, src, dst); 1409 DO_imm_mandr_r("pblendw", 94, src, dst); 1410 DO_imm_mandr_r("pblendw", 95, src, dst); 1411 DO_imm_mandr_r("pblendw", 96, src, dst); 1412 DO_imm_mandr_r("pblendw", 97, src, dst); 1413 DO_imm_mandr_r("pblendw", 98, src, dst); 1414 DO_imm_mandr_r("pblendw", 99, src, dst); 1415 DO_imm_mandr_r("pblendw", 100, src, dst); 1416 DO_imm_mandr_r("pblendw", 101, src, dst); 1417 DO_imm_mandr_r("pblendw", 102, src, dst); 1418 DO_imm_mandr_r("pblendw", 103, src, dst); 1419 DO_imm_mandr_r("pblendw", 104, src, dst); 1420 DO_imm_mandr_r("pblendw", 105, src, dst); 1421 DO_imm_mandr_r("pblendw", 106, src, dst); 1422 
DO_imm_mandr_r("pblendw", 107, src, dst); 1423 DO_imm_mandr_r("pblendw", 108, src, dst); 1424 DO_imm_mandr_r("pblendw", 109, src, dst); 1425 DO_imm_mandr_r("pblendw", 110, src, dst); 1426 DO_imm_mandr_r("pblendw", 111, src, dst); 1427 DO_imm_mandr_r("pblendw", 112, src, dst); 1428 DO_imm_mandr_r("pblendw", 113, src, dst); 1429 DO_imm_mandr_r("pblendw", 114, src, dst); 1430 DO_imm_mandr_r("pblendw", 115, src, dst); 1431 DO_imm_mandr_r("pblendw", 116, src, dst); 1432 DO_imm_mandr_r("pblendw", 117, src, dst); 1433 DO_imm_mandr_r("pblendw", 118, src, dst); 1434 DO_imm_mandr_r("pblendw", 119, src, dst); 1435 DO_imm_mandr_r("pblendw", 120, src, dst); 1436 DO_imm_mandr_r("pblendw", 121, src, dst); 1437 DO_imm_mandr_r("pblendw", 122, src, dst); 1438 DO_imm_mandr_r("pblendw", 123, src, dst); 1439 DO_imm_mandr_r("pblendw", 124, src, dst); 1440 DO_imm_mandr_r("pblendw", 125, src, dst); 1441 DO_imm_mandr_r("pblendw", 126, src, dst); 1442 DO_imm_mandr_r("pblendw", 127, src, dst); 1443 DO_imm_mandr_r("pblendw", 128, src, dst); 1444 DO_imm_mandr_r("pblendw", 129, src, dst); 1445 DO_imm_mandr_r("pblendw", 130, src, dst); 1446 DO_imm_mandr_r("pblendw", 131, src, dst); 1447 DO_imm_mandr_r("pblendw", 132, src, dst); 1448 DO_imm_mandr_r("pblendw", 133, src, dst); 1449 DO_imm_mandr_r("pblendw", 134, src, dst); 1450 DO_imm_mandr_r("pblendw", 135, src, dst); 1451 DO_imm_mandr_r("pblendw", 136, src, dst); 1452 DO_imm_mandr_r("pblendw", 137, src, dst); 1453 DO_imm_mandr_r("pblendw", 138, src, dst); 1454 DO_imm_mandr_r("pblendw", 139, src, dst); 1455 DO_imm_mandr_r("pblendw", 140, src, dst); 1456 DO_imm_mandr_r("pblendw", 141, src, dst); 1457 DO_imm_mandr_r("pblendw", 142, src, dst); 1458 DO_imm_mandr_r("pblendw", 143, src, dst); 1459 DO_imm_mandr_r("pblendw", 144, src, dst); 1460 DO_imm_mandr_r("pblendw", 145, src, dst); 1461 DO_imm_mandr_r("pblendw", 146, src, dst); 1462 DO_imm_mandr_r("pblendw", 147, src, dst); 1463 DO_imm_mandr_r("pblendw", 148, src, dst); 1464 DO_imm_mandr_r("pblendw", 
149, src, dst); 1465 DO_imm_mandr_r("pblendw", 150, src, dst); 1466 DO_imm_mandr_r("pblendw", 151, src, dst); 1467 DO_imm_mandr_r("pblendw", 152, src, dst); 1468 DO_imm_mandr_r("pblendw", 153, src, dst); 1469 DO_imm_mandr_r("pblendw", 154, src, dst); 1470 DO_imm_mandr_r("pblendw", 155, src, dst); 1471 DO_imm_mandr_r("pblendw", 156, src, dst); 1472 DO_imm_mandr_r("pblendw", 157, src, dst); 1473 DO_imm_mandr_r("pblendw", 158, src, dst); 1474 DO_imm_mandr_r("pblendw", 159, src, dst); 1475 DO_imm_mandr_r("pblendw", 160, src, dst); 1476 DO_imm_mandr_r("pblendw", 161, src, dst); 1477 DO_imm_mandr_r("pblendw", 162, src, dst); 1478 DO_imm_mandr_r("pblendw", 163, src, dst); 1479 DO_imm_mandr_r("pblendw", 164, src, dst); 1480 DO_imm_mandr_r("pblendw", 165, src, dst); 1481 DO_imm_mandr_r("pblendw", 166, src, dst); 1482 DO_imm_mandr_r("pblendw", 167, src, dst); 1483 DO_imm_mandr_r("pblendw", 168, src, dst); 1484 DO_imm_mandr_r("pblendw", 169, src, dst); 1485 DO_imm_mandr_r("pblendw", 170, src, dst); 1486 DO_imm_mandr_r("pblendw", 171, src, dst); 1487 DO_imm_mandr_r("pblendw", 172, src, dst); 1488 DO_imm_mandr_r("pblendw", 173, src, dst); 1489 DO_imm_mandr_r("pblendw", 174, src, dst); 1490 DO_imm_mandr_r("pblendw", 175, src, dst); 1491 DO_imm_mandr_r("pblendw", 176, src, dst); 1492 DO_imm_mandr_r("pblendw", 177, src, dst); 1493 DO_imm_mandr_r("pblendw", 178, src, dst); 1494 DO_imm_mandr_r("pblendw", 179, src, dst); 1495 DO_imm_mandr_r("pblendw", 180, src, dst); 1496 DO_imm_mandr_r("pblendw", 181, src, dst); 1497 DO_imm_mandr_r("pblendw", 182, src, dst); 1498 DO_imm_mandr_r("pblendw", 183, src, dst); 1499 DO_imm_mandr_r("pblendw", 184, src, dst); 1500 DO_imm_mandr_r("pblendw", 185, src, dst); 1501 DO_imm_mandr_r("pblendw", 186, src, dst); 1502 DO_imm_mandr_r("pblendw", 187, src, dst); 1503 DO_imm_mandr_r("pblendw", 188, src, dst); 1504 DO_imm_mandr_r("pblendw", 189, src, dst); 1505 DO_imm_mandr_r("pblendw", 190, src, dst); 1506 DO_imm_mandr_r("pblendw", 191, src, dst); 1507 
DO_imm_mandr_r("pblendw", 192, src, dst); 1508 DO_imm_mandr_r("pblendw", 193, src, dst); 1509 DO_imm_mandr_r("pblendw", 194, src, dst); 1510 DO_imm_mandr_r("pblendw", 195, src, dst); 1511 DO_imm_mandr_r("pblendw", 196, src, dst); 1512 DO_imm_mandr_r("pblendw", 197, src, dst); 1513 DO_imm_mandr_r("pblendw", 198, src, dst); 1514 DO_imm_mandr_r("pblendw", 199, src, dst); 1515 DO_imm_mandr_r("pblendw", 200, src, dst); 1516 DO_imm_mandr_r("pblendw", 201, src, dst); 1517 DO_imm_mandr_r("pblendw", 202, src, dst); 1518 DO_imm_mandr_r("pblendw", 203, src, dst); 1519 DO_imm_mandr_r("pblendw", 204, src, dst); 1520 DO_imm_mandr_r("pblendw", 205, src, dst); 1521 DO_imm_mandr_r("pblendw", 206, src, dst); 1522 DO_imm_mandr_r("pblendw", 207, src, dst); 1523 DO_imm_mandr_r("pblendw", 208, src, dst); 1524 DO_imm_mandr_r("pblendw", 209, src, dst); 1525 DO_imm_mandr_r("pblendw", 210, src, dst); 1526 DO_imm_mandr_r("pblendw", 211, src, dst); 1527 DO_imm_mandr_r("pblendw", 212, src, dst); 1528 DO_imm_mandr_r("pblendw", 213, src, dst); 1529 DO_imm_mandr_r("pblendw", 214, src, dst); 1530 DO_imm_mandr_r("pblendw", 215, src, dst); 1531 DO_imm_mandr_r("pblendw", 216, src, dst); 1532 DO_imm_mandr_r("pblendw", 217, src, dst); 1533 DO_imm_mandr_r("pblendw", 218, src, dst); 1534 DO_imm_mandr_r("pblendw", 219, src, dst); 1535 DO_imm_mandr_r("pblendw", 220, src, dst); 1536 DO_imm_mandr_r("pblendw", 221, src, dst); 1537 DO_imm_mandr_r("pblendw", 222, src, dst); 1538 DO_imm_mandr_r("pblendw", 223, src, dst); 1539 DO_imm_mandr_r("pblendw", 224, src, dst); 1540 DO_imm_mandr_r("pblendw", 225, src, dst); 1541 DO_imm_mandr_r("pblendw", 226, src, dst); 1542 DO_imm_mandr_r("pblendw", 227, src, dst); 1543 DO_imm_mandr_r("pblendw", 228, src, dst); 1544 DO_imm_mandr_r("pblendw", 229, src, dst); 1545 DO_imm_mandr_r("pblendw", 230, src, dst); 1546 DO_imm_mandr_r("pblendw", 231, src, dst); 1547 DO_imm_mandr_r("pblendw", 232, src, dst); 1548 DO_imm_mandr_r("pblendw", 233, src, dst); 1549 DO_imm_mandr_r("pblendw", 
234, src, dst); 1550 DO_imm_mandr_r("pblendw", 235, src, dst); 1551 DO_imm_mandr_r("pblendw", 236, src, dst); 1552 DO_imm_mandr_r("pblendw", 237, src, dst); 1553 DO_imm_mandr_r("pblendw", 238, src, dst); 1554 DO_imm_mandr_r("pblendw", 239, src, dst); 1555 DO_imm_mandr_r("pblendw", 240, src, dst); 1556 DO_imm_mandr_r("pblendw", 241, src, dst); 1557 DO_imm_mandr_r("pblendw", 242, src, dst); 1558 DO_imm_mandr_r("pblendw", 243, src, dst); 1559 DO_imm_mandr_r("pblendw", 244, src, dst); 1560 DO_imm_mandr_r("pblendw", 245, src, dst); 1561 DO_imm_mandr_r("pblendw", 246, src, dst); 1562 DO_imm_mandr_r("pblendw", 247, src, dst); 1563 DO_imm_mandr_r("pblendw", 248, src, dst); 1564 DO_imm_mandr_r("pblendw", 249, src, dst); 1565 DO_imm_mandr_r("pblendw", 250, src, dst); 1566 DO_imm_mandr_r("pblendw", 251, src, dst); 1567 DO_imm_mandr_r("pblendw", 252, src, dst); 1568 DO_imm_mandr_r("pblendw", 253, src, dst); 1569 DO_imm_mandr_r("pblendw", 254, src, dst); 1570 DO_imm_mandr_r("pblendw", 255, src, dst); 1571 } 1572 } 1573 1574 1575 void test_PCMPEQQ ( void ) 1576 { 1577 V128 src, dst; 1578 Int i; 1579 for (i = 0; i < 10; i++) { 1580 randV128(&src); 1581 randV128(&dst); 1582 switch (i - 6) { 1583 case 0: memset(&src[0], 0x55, 8); 1584 memset(&dst[0], 0x55, 8); break; 1585 case 1: memset(&src[8], 0x55, 8); 1586 memset(&dst[8], 0x55, 8); break; 1587 default: 1588 break; 1589 } 1590 DO_mandr_r("pcmpeqq", src, dst); 1591 } 1592 } 1593 1594 1595 void test_PEXTRB ( void ) 1596 { 1597 V128 src; 1598 randV128(&src); 1599 DO_imm_r_to_mandrscalar("pextrb", 0, src, "d"); 1600 DO_imm_r_to_mandrscalar("pextrb", 1, src, "d"); 1601 DO_imm_r_to_mandrscalar("pextrb", 2, src, "d"); 1602 DO_imm_r_to_mandrscalar("pextrb", 3, src, "d"); 1603 DO_imm_r_to_mandrscalar("pextrb", 4, src, "d"); 1604 DO_imm_r_to_mandrscalar("pextrb", 5, src, "d"); 1605 DO_imm_r_to_mandrscalar("pextrb", 6, src, "d"); 1606 DO_imm_r_to_mandrscalar("pextrb", 7, src, "d"); 1607 DO_imm_r_to_mandrscalar("pextrb", 8, src, "d"); 1608 
DO_imm_r_to_mandrscalar("pextrb", 9, src, "d"); 1609 DO_imm_r_to_mandrscalar("pextrb", 10, src, "d"); 1610 DO_imm_r_to_mandrscalar("pextrb", 11, src, "d"); 1611 DO_imm_r_to_mandrscalar("pextrb", 12, src, "d"); 1612 DO_imm_r_to_mandrscalar("pextrb", 13, src, "d"); 1613 DO_imm_r_to_mandrscalar("pextrb", 14, src, "d"); 1614 DO_imm_r_to_mandrscalar("pextrb", 15, src, "d"); 1615 } 1616 1617 void test_PINSRB ( void ) 1618 { 1619 ULong src; 1620 src = randULong(); 1621 DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d"); 1622 src = randULong(); 1623 DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d"); 1624 src = randULong(); 1625 DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d"); 1626 src = randULong(); 1627 DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d"); 1628 src = randULong(); 1629 DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d"); 1630 src = randULong(); 1631 DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d"); 1632 src = randULong(); 1633 DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d"); 1634 src = randULong(); 1635 DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d"); 1636 src = randULong(); 1637 DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d"); 1638 src = randULong(); 1639 DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d"); 1640 src = randULong(); 1641 DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d"); 1642 src = randULong(); 1643 DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d"); 1644 src = randULong(); 1645 DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d"); 1646 src = randULong(); 1647 DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d"); 1648 src = randULong(); 1649 DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d"); 1650 src = randULong(); 1651 DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d"); 1652 } 1653 1654 1655 void test_PEXTRW ( void ) 1656 { 1657 V128 src; 1658 randV128(&src); 1659 DO_imm_r_to_mandrscalar("pextrw", 0, src, "d"); 1660 DO_imm_r_to_mandrscalar("pextrw", 1, src, "d"); 1661 DO_imm_r_to_mandrscalar("pextrw", 2, src, "d"); 1662 DO_imm_r_to_mandrscalar("pextrw", 3, src, "d"); 1663 
DO_imm_r_to_mandrscalar("pextrw", 4, src, "d"); 1664 DO_imm_r_to_mandrscalar("pextrw", 5, src, "d"); 1665 DO_imm_r_to_mandrscalar("pextrw", 6, src, "d"); 1666 DO_imm_r_to_mandrscalar("pextrw", 7, src, "d"); 1667 } 1668 1669 void test_PINSRW ( void ) 1670 { 1671 ULong src; 1672 src = randULong(); 1673 DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d"); 1674 src = randULong(); 1675 DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d"); 1676 src = randULong(); 1677 DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d"); 1678 src = randULong(); 1679 DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d"); 1680 src = randULong(); 1681 DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d"); 1682 src = randULong(); 1683 DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d"); 1684 src = randULong(); 1685 DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d"); 1686 src = randULong(); 1687 DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d"); 1688 } 1689 1690 1691 void test_PEXTRD ( void ) 1692 { 1693 V128 src; 1694 randV128(&src); 1695 DO_imm_r_to_mandrscalar("pextrd", 0, src, "d"); 1696 DO_imm_r_to_mandrscalar("pextrd", 1, src, "d"); 1697 DO_imm_r_to_mandrscalar("pextrd", 2, src, "d"); 1698 DO_imm_r_to_mandrscalar("pextrd", 3, src, "d"); 1699 } 1700 1701 void test_PINSRD ( void ) 1702 { 1703 ULong src; 1704 src = randULong(); 1705 DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d"); 1706 src = randULong(); 1707 DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d"); 1708 src = randULong(); 1709 DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d"); 1710 src = randULong(); 1711 DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d"); 1712 } 1713 1714 1715 void test_PEXTRQ ( void ) 1716 { 1717 V128 src; 1718 randV128(&src); 1719 DO_imm_r_to_mandrscalar("pextrq", 0, src, ""); 1720 DO_imm_r_to_mandrscalar("pextrq", 1, src, ""); 1721 } 1722 1723 void test_PINSRQ ( void ) 1724 { 1725 ULong src; 1726 src = randULong(); 1727 DO_imm_mandrscalar_to_r("pinsrq", 0, src, ""); 1728 src = randULong(); 1729 DO_imm_mandrscalar_to_r("pinsrq", 1, src, ""); 1730 } 1731 1732 
1733 void test_EXTRACTPS ( void ) 1734 { 1735 V128 src; 1736 randV128(&src); 1737 DO_imm_r_to_mandrscalar("extractps", 0, src, "d"); 1738 DO_imm_r_to_mandrscalar("extractps", 1, src, "d"); 1739 DO_imm_r_to_mandrscalar("extractps", 2, src, "d"); 1740 DO_imm_r_to_mandrscalar("extractps", 3, src, "d"); 1741 } 1742 1743 1744 void test_PHMINPOSUW ( void ) 1745 { 1746 V128 src, dst; 1747 Int i; 1748 for (i = 0; i < 10; i++) { 1749 randV128(&src); 1750 randV128(&dst); 1751 DO_mandr_r("phminposuw", src, dst); 1752 } 1753 } 1754 1755 void test_PMAXSB ( void ) 1756 { 1757 V128 src, dst; 1758 Int i; 1759 for (i = 0; i < 10; i++) { 1760 randV128(&src); 1761 randV128(&dst); 1762 DO_mandr_r("pmaxsb", src, dst); 1763 } 1764 } 1765 1766 void test_PMAXSD ( void ) 1767 { 1768 V128 src, dst; 1769 Int i; 1770 for (i = 0; i < 10; i++) { 1771 randV128(&src); 1772 randV128(&dst); 1773 DO_mandr_r("pmaxsd", src, dst); 1774 } 1775 } 1776 1777 void test_PMAXUD ( void ) 1778 { 1779 V128 src, dst; 1780 Int i; 1781 for (i = 0; i < 10; i++) { 1782 randV128(&src); 1783 randV128(&dst); 1784 DO_mandr_r("pmaxud", src, dst); 1785 } 1786 } 1787 1788 void test_PMAXUW ( void ) 1789 { 1790 V128 src, dst; 1791 Int i; 1792 for (i = 0; i < 10; i++) { 1793 randV128(&src); 1794 randV128(&dst); 1795 DO_mandr_r("pmaxuw", src, dst); 1796 } 1797 } 1798 1799 void test_PMINSB ( void ) 1800 { 1801 V128 src, dst; 1802 Int i; 1803 for (i = 0; i < 10; i++) { 1804 randV128(&src); 1805 randV128(&dst); 1806 DO_mandr_r("pminsb", src, dst); 1807 } 1808 } 1809 1810 void test_PMINSD ( void ) 1811 { 1812 V128 src, dst; 1813 Int i; 1814 for (i = 0; i < 10; i++) { 1815 randV128(&src); 1816 randV128(&dst); 1817 DO_mandr_r("pminsd", src, dst); 1818 } 1819 } 1820 1821 void test_PMINUD ( void ) 1822 { 1823 V128 src, dst; 1824 Int i; 1825 for (i = 0; i < 10; i++) { 1826 randV128(&src); 1827 randV128(&dst); 1828 DO_mandr_r("pminud", src, dst); 1829 } 1830 } 1831 1832 void test_PMINUW ( void ) 1833 { 1834 V128 src, dst; 1835 Int i; 
1836 for (i = 0; i < 10; i++) { 1837 randV128(&src); 1838 randV128(&dst); 1839 DO_mandr_r("pminuw", src, dst); 1840 } 1841 } 1842 1843 void test_PMOVSXBW ( void ) 1844 { 1845 V128 src, dst; 1846 Int i; 1847 for (i = 0; i < 10; i++) { 1848 randV128(&src); 1849 randV128(&dst); 1850 DO_mandr_r("pmovsxbw", src, dst); 1851 } 1852 } 1853 1854 void test_PMOVSXBD ( void ) 1855 { 1856 V128 src, dst; 1857 Int i; 1858 for (i = 0; i < 10; i++) { 1859 randV128(&src); 1860 randV128(&dst); 1861 DO_mandr_r("pmovsxbd", src, dst); 1862 } 1863 } 1864 1865 void test_PMOVSXBQ ( void ) 1866 { 1867 V128 src, dst; 1868 Int i; 1869 for (i = 0; i < 10; i++) { 1870 randV128(&src); 1871 randV128(&dst); 1872 DO_mandr_r("pmovsxbq", src, dst); 1873 } 1874 } 1875 1876 void test_PMOVSXWD ( void ) 1877 { 1878 V128 src, dst; 1879 Int i; 1880 for (i = 0; i < 10; i++) { 1881 randV128(&src); 1882 randV128(&dst); 1883 DO_mandr_r("pmovsxwd", src, dst); 1884 } 1885 } 1886 1887 void test_PMOVSXWQ ( void ) 1888 { 1889 V128 src, dst; 1890 Int i; 1891 for (i = 0; i < 10; i++) { 1892 randV128(&src); 1893 randV128(&dst); 1894 DO_mandr_r("pmovsxwq", src, dst); 1895 } 1896 } 1897 1898 void test_PMOVSXDQ ( void ) 1899 { 1900 V128 src, dst; 1901 Int i; 1902 for (i = 0; i < 10; i++) { 1903 randV128(&src); 1904 randV128(&dst); 1905 DO_mandr_r("pmovsxdq", src, dst); 1906 } 1907 } 1908 1909 void test_PMOVZXBW ( void ) 1910 { 1911 V128 src, dst; 1912 Int i; 1913 for (i = 0; i < 10; i++) { 1914 randV128(&src); 1915 randV128(&dst); 1916 DO_mandr_r("pmovzxbw", src, dst); 1917 } 1918 } 1919 1920 void test_PMOVZXBD ( void ) 1921 { 1922 V128 src, dst; 1923 Int i; 1924 for (i = 0; i < 10; i++) { 1925 randV128(&src); 1926 randV128(&dst); 1927 DO_mandr_r("pmovzxbd", src, dst); 1928 } 1929 } 1930 1931 void test_PMOVZXBQ ( void ) 1932 { 1933 V128 src, dst; 1934 Int i; 1935 for (i = 0; i < 10; i++) { 1936 randV128(&src); 1937 randV128(&dst); 1938 DO_mandr_r("pmovzxbq", src, dst); 1939 } 1940 } 1941 1942 void test_PMOVZXWD ( void ) 
1943 { 1944 V128 src, dst; 1945 Int i; 1946 for (i = 0; i < 10; i++) { 1947 randV128(&src); 1948 randV128(&dst); 1949 DO_mandr_r("pmovzxwd", src, dst); 1950 } 1951 } 1952 1953 void test_PMOVZXWQ ( void ) 1954 { 1955 V128 src, dst; 1956 Int i; 1957 for (i = 0; i < 10; i++) { 1958 randV128(&src); 1959 randV128(&dst); 1960 DO_mandr_r("pmovzxwq", src, dst); 1961 } 1962 } 1963 1964 void test_PMOVZXDQ ( void ) 1965 { 1966 V128 src, dst; 1967 Int i; 1968 for (i = 0; i < 10; i++) { 1969 randV128(&src); 1970 randV128(&dst); 1971 DO_mandr_r("pmovzxdq", src, dst); 1972 } 1973 } 1974 1975 void test_PMULDQ ( void ) 1976 { 1977 V128 src, dst; 1978 Int i; 1979 for (i = 0; i < 10; i++) { 1980 randV128(&src); 1981 randV128(&dst); 1982 DO_mandr_r("pmuldq", src, dst); 1983 } 1984 } 1985 1986 1987 void test_PMULLD ( void ) 1988 { 1989 V128 src, dst; 1990 Int i; 1991 for (i = 0; i < 10; i++) { 1992 randV128(&src); 1993 randV128(&dst); 1994 DO_mandr_r("pmulld", src, dst); 1995 } 1996 } 1997 1998 1999 void test_POPCNTQ ( void ) 2000 { 2001 ULong block[4]; 2002 Int i; 2003 ULong oszacp_mask = 0x8D5; 2004 for (i = 0; i < 10; i++) { 2005 block[0] = i == 0 ? 0 : randULong(); 2006 block[1] = randULong(); 2007 block[2] = randULong(); 2008 block[3] = randULong(); 2009 __asm__ __volatile__( 2010 "movq %0, %%rax" "\n\t" 2011 "movq 0(%%rax), %%rdi" "\n\t" 2012 "movq 8(%%rax), %%r11" "\n\t" 2013 #ifndef VGP_amd64_darwin 2014 "popcntq %%rdi, %%r11" "\n\t" 2015 #else 2016 "popcnt %%rdi, %%r11" "\n\t" 2017 #endif 2018 "movq %%r11, 16(%%rax)" "\n\t" 2019 "pushfq" "\n\t" 2020 "popq %%r12" "\n\t" 2021 "movq %%r12, 24(%%rax)" "\n" 2022 : /*out*/ 2023 : /*in*/"r"(&block[0]) 2024 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2025 ); 2026 printf("r popcntq %016llx %016llx %016llx %016llx\n", 2027 block[0], block[1], block[2], block[3] & oszacp_mask); 2028 2029 block[0] = i == 0 ? 
0 : randULong(); 2030 block[1] = randULong(); 2031 block[2] = randULong(); 2032 block[3] = randULong(); 2033 __asm__ __volatile__( 2034 "movq %0, %%rax" "\n\t" 2035 "movq 8(%%rax), %%r11" "\n\t" 2036 #ifndef VGP_amd64_darwin 2037 "popcntq 0(%%rax), %%r11" "\n\t" 2038 #else 2039 "popcnt 0(%%rax), %%r11" "\n\t" 2040 #endif 2041 "movq %%r11, 16(%%rax)" "\n\t" 2042 "pushfq" "\n\t" 2043 "popq %%r12" "\n\t" 2044 "movq %%r12, 24(%%rax)" "\n" 2045 : /*out*/ 2046 : /*in*/"r"(&block[0]) 2047 : /*trash*/ "cc", "memory", "r11", "r12" 2048 ); 2049 printf("m popcntq %016llx %016llx %016llx %016llx\n", 2050 block[0], block[1], block[2], block[3] & oszacp_mask); 2051 } 2052 } 2053 2054 2055 void test_POPCNTL ( void ) 2056 { 2057 ULong block[4]; 2058 Int i; 2059 ULong oszacp_mask = 0x8D5; 2060 for (i = 0; i < 10; i++) { 2061 block[0] = i == 0 ? 0 : randULong(); 2062 block[1] = randULong(); 2063 block[2] = randULong(); 2064 block[3] = randULong(); 2065 __asm__ __volatile__( 2066 "movq %0, %%rax" "\n\t" 2067 "movq 0(%%rax), %%rdi" "\n\t" 2068 "movq 8(%%rax), %%r11" "\n\t" 2069 #ifndef VGP_amd64_darwin 2070 "popcntl %%edi, %%r11d" "\n\t" 2071 #else 2072 "popcnt %%edi, %%r11d" "\n\t" 2073 #endif 2074 "movq %%r11, 16(%%rax)" "\n\t" 2075 "pushfq" "\n\t" 2076 "popq %%r12" "\n\t" 2077 "movq %%r12, 24(%%rax)" "\n" 2078 : /*out*/ 2079 : /*in*/"r"(&block[0]) 2080 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2081 ); 2082 printf("r popcntl %016llx %016llx %016llx %016llx\n", 2083 block[0], block[1], block[2], block[3] & oszacp_mask); 2084 2085 block[0] = i == 0 ? 
0 : randULong(); 2086 block[1] = randULong(); 2087 block[2] = randULong(); 2088 block[3] = randULong(); 2089 __asm__ __volatile__( 2090 "movq %0, %%rax" "\n\t" 2091 "movq 8(%%rax), %%r11" "\n\t" 2092 #ifndef VGP_amd64_darwin 2093 "popcntl 0(%%rax), %%r11d" "\n\t" 2094 #else 2095 "popcnt 0(%%rax), %%r11d" "\n\t" 2096 #endif 2097 "movq %%r11, 16(%%rax)" "\n\t" 2098 "pushfq" "\n\t" 2099 "popq %%r12" "\n\t" 2100 "movq %%r12, 24(%%rax)" "\n" 2101 : /*out*/ 2102 : /*in*/"r"(&block[0]) 2103 : /*trash*/ "cc", "memory", "r11", "r12" 2104 ); 2105 printf("m popcntl %016llx %016llx %016llx %016llx\n", 2106 block[0], block[1], block[2], block[3] & oszacp_mask); 2107 } 2108 } 2109 2110 2111 void test_POPCNTW ( void ) 2112 { 2113 ULong block[4]; 2114 Int i; 2115 ULong oszacp_mask = 0x8D5; 2116 for (i = 0; i < 10; i++) { 2117 block[0] = i == 0 ? 0 : randULong(); 2118 block[1] = randULong(); 2119 block[2] = randULong(); 2120 block[3] = randULong(); 2121 __asm__ __volatile__( 2122 "movq %0, %%rax" "\n\t" 2123 "movq 0(%%rax), %%rdi" "\n\t" 2124 "movq 8(%%rax), %%r11" "\n\t" 2125 #ifndef VGP_amd64_darwin 2126 "popcntw %%di, %%r11w" "\n\t" 2127 #else 2128 "popcnt %%di, %%r11w" "\n\t" 2129 #endif 2130 "movq %%r11, 16(%%rax)" "\n\t" 2131 "pushfq" "\n\t" 2132 "popq %%r12" "\n\t" 2133 "movq %%r12, 24(%%rax)" "\n" 2134 : /*out*/ 2135 : /*in*/"r"(&block[0]) 2136 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2137 ); 2138 printf("r popcntw %016llx %016llx %016llx %016llx\n", 2139 block[0], block[1], block[2], block[3] & oszacp_mask); 2140 2141 block[0] = i == 0 ? 
0 : randULong(); 2142 block[1] = randULong(); 2143 block[2] = randULong(); 2144 block[3] = randULong(); 2145 __asm__ __volatile__( 2146 "movq %0, %%rax" "\n\t" 2147 "movq 8(%%rax), %%r11" "\n\t" 2148 #ifndef VGP_amd64_darwin 2149 "popcntw 0(%%rax), %%r11w" "\n\t" 2150 #else 2151 "popcnt 0(%%rax), %%r11w" "\n\t" 2152 #endif 2153 "movq %%r11, 16(%%rax)" "\n\t" 2154 "pushfq" "\n\t" 2155 "popq %%r12" "\n\t" 2156 "movq %%r12, 24(%%rax)" "\n" 2157 : /*out*/ 2158 : /*in*/"r"(&block[0]) 2159 : /*trash*/ "cc", "memory", "r11", "r12" 2160 ); 2161 printf("m popcntw %016llx %016llx %016llx %016llx\n", 2162 block[0], block[1], block[2], block[3] & oszacp_mask); 2163 } 2164 } 2165 2166 2167 void test_PCMPGTQ ( void ) 2168 { 2169 V128 spec[7]; 2170 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL ); 2171 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL ); 2172 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL ); 2173 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL ); 2174 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL ); 2175 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL ); 2176 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL ); 2177 2178 V128 src, dst; 2179 Int i, j; 2180 for (i = 0; i < 10; i++) { 2181 randV128(&src); 2182 randV128(&dst); 2183 DO_mandr_r("pcmpgtq", src, dst); 2184 } 2185 for (i = 0; i < 7; i++) { 2186 for (j = 0; j < 7; j++) { 2187 memcpy(&src, &spec[i], 16); 2188 memcpy(&dst, &spec[j], 16); 2189 DO_mandr_r("pcmpgtq", src, dst); 2190 } 2191 } 2192 } 2193 2194 /* ------------ ROUNDSD ------------ */ 2195 2196 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2197 { 2198 if (mem) { 2199 __asm__ __volatile__( 2200 "movupd (%1), %%xmm11" "\n\t" 2201 "roundsd $0, (%0), %%xmm11" "\n\t" 2202 "movupd %%xmm11, (%1)" "\n" 2203 : /*OUT*/ 2204 : /*IN*/ "r"(src), "r"(dst) 2205 : /*TRASH*/ "xmm11" 2206 ); 2207 } else { 
2208 __asm__ __volatile__( 2209 "movupd (%1), %%xmm11" "\n\t" 2210 "movupd (%0), %%xmm2" "\n\t" 2211 "roundsd $0, %%xmm2, %%xmm11" "\n\t" 2212 "movupd %%xmm11, (%1)" "\n" 2213 : /*OUT*/ 2214 : /*IN*/ "r"(src), "r"(dst) 2215 : /*TRASH*/ "xmm11","xmm2" 2216 ); 2217 } 2218 } 2219 2220 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2221 { 2222 if (mem) { 2223 __asm__ __volatile__( 2224 "movupd (%1), %%xmm11" "\n\t" 2225 "roundsd $1, (%0), %%xmm11" "\n\t" 2226 "movupd %%xmm11, (%1)" "\n" 2227 : /*OUT*/ 2228 : /*IN*/ "r"(src), "r"(dst) 2229 : /*TRASH*/ "xmm11" 2230 ); 2231 } else { 2232 __asm__ __volatile__( 2233 "movupd (%1), %%xmm11" "\n\t" 2234 "movupd (%0), %%xmm2" "\n\t" 2235 "roundsd $1, %%xmm2, %%xmm11" "\n\t" 2236 "movupd %%xmm11, (%1)" "\n" 2237 : /*OUT*/ 2238 : /*IN*/ "r"(src), "r"(dst) 2239 : /*TRASH*/ "xmm11","xmm2" 2240 ); 2241 } 2242 } 2243 2244 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2245 { 2246 if (mem) { 2247 __asm__ __volatile__( 2248 "movupd (%1), %%xmm11" "\n\t" 2249 "roundsd $2, (%0), %%xmm11" "\n\t" 2250 "movupd %%xmm11, (%1)" "\n" 2251 : /*OUT*/ 2252 : /*IN*/ "r"(src), "r"(dst) 2253 : /*TRASH*/ "xmm11" 2254 ); 2255 } else { 2256 __asm__ __volatile__( 2257 "movupd (%1), %%xmm11" "\n\t" 2258 "movupd (%0), %%xmm2" "\n\t" 2259 "roundsd $2, %%xmm2, %%xmm11" "\n\t" 2260 "movupd %%xmm11, (%1)" "\n" 2261 : /*OUT*/ 2262 : /*IN*/ "r"(src), "r"(dst) 2263 : /*TRASH*/ "xmm11","xmm2" 2264 ); 2265 } 2266 } 2267 2268 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2269 { 2270 if (mem) { 2271 __asm__ __volatile__( 2272 "movupd (%1), %%xmm11" "\n\t" 2273 "roundsd $3, (%0), %%xmm11" "\n\t" 2274 "movupd %%xmm11, (%1)" "\n" 2275 : /*OUT*/ 2276 : /*IN*/ "r"(src), "r"(dst) 2277 : /*TRASH*/ "xmm11" 2278 ); 2279 } else { 2280 __asm__ __volatile__( 2281 "movupd (%1), %%xmm11" "\n\t" 2282 "movupd (%0), %%xmm2" "\n\t" 2283 "roundsd $3, %%xmm2, %%xmm11" "\n\t" 2284 "movupd %%xmm11, (%1)" "\n" 2285 : /*OUT*/ 2286 : /*IN*/ 
"r"(src), "r"(dst) 2287 : /*TRASH*/ "xmm11","xmm2" 2288 ); 2289 } 2290 } 2291 2292 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2293 { 2294 if (mem) { 2295 __asm__ __volatile__( 2296 "movupd (%1), %%xmm11" "\n\t" 2297 "roundsd $4, (%0), %%xmm11" "\n\t" 2298 "movupd %%xmm11, (%1)" "\n" 2299 : /*OUT*/ 2300 : /*IN*/ "r"(src), "r"(dst) 2301 : /*TRASH*/ "xmm11" 2302 ); 2303 } else { 2304 __asm__ __volatile__( 2305 "movupd (%1), %%xmm11" "\n\t" 2306 "movupd (%0), %%xmm2" "\n\t" 2307 "roundsd $4, %%xmm2, %%xmm11" "\n\t" 2308 "movupd %%xmm11, (%1)" "\n" 2309 : /*OUT*/ 2310 : /*IN*/ "r"(src), "r"(dst) 2311 : /*TRASH*/ "xmm11","xmm2" 2312 ); 2313 } 2314 } 2315 2316 void test_ROUNDSD_w_immediate_rounding ( void ) 2317 { 2318 double vals[22]; 2319 Int i = 0; 2320 vals[i++] = 0.0; 2321 vals[i++] = -0.0; 2322 vals[i++] = mkPosInf(); 2323 vals[i++] = mkNegInf(); 2324 vals[i++] = mkPosNan(); 2325 vals[i++] = mkNegNan(); 2326 vals[i++] = -1.3; 2327 vals[i++] = -1.1; 2328 vals[i++] = -0.9; 2329 vals[i++] = -0.7; 2330 vals[i++] = -0.50001; 2331 vals[i++] = -0.49999; 2332 vals[i++] = -0.3; 2333 vals[i++] = -0.1; 2334 vals[i++] = 0.1; 2335 vals[i++] = 0.3; 2336 vals[i++] = 0.49999; 2337 vals[i++] = 0.50001; 2338 vals[i++] = 0.7; 2339 vals[i++] = 0.9; 2340 vals[i++] = 1.1; 2341 vals[i++] = 1.3; 2342 assert(i == 22); 2343 2344 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2345 V128 src, dst; 2346 2347 randV128(&src); 2348 randV128(&dst); 2349 memcpy(&src[0], &vals[i], 8); 2350 do_ROUNDSD_000(False/*reg*/, &src, &dst); 2351 printf("r roundsd_000 "); 2352 showV128(&src); 2353 printf(" "); 2354 showV128(&dst); 2355 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2356 printf("\n"); 2357 2358 randV128(&src); 2359 randV128(&dst); 2360 memcpy(&src[0], &vals[i], 8); 2361 do_ROUNDSD_000(True/*mem*/, &src, &dst); 2362 printf("m roundsd_000 "); 2363 showV128(&src); 2364 printf(" "); 2365 showV128(&dst); 2366 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2367 
printf("\n"); 2368 2369 2370 randV128(&src); 2371 randV128(&dst); 2372 memcpy(&src[0], &vals[i], 8); 2373 do_ROUNDSD_001(False/*reg*/, &src, &dst); 2374 printf("r roundsd_001 "); 2375 showV128(&src); 2376 printf(" "); 2377 showV128(&dst); 2378 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2379 printf("\n"); 2380 2381 randV128(&src); 2382 randV128(&dst); 2383 memcpy(&src[0], &vals[i], 8); 2384 do_ROUNDSD_001(True/*mem*/, &src, &dst); 2385 printf("m roundsd_001 "); 2386 showV128(&src); 2387 printf(" "); 2388 showV128(&dst); 2389 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2390 printf("\n"); 2391 2392 2393 randV128(&src); 2394 randV128(&dst); 2395 memcpy(&src[0], &vals[i], 8); 2396 do_ROUNDSD_010(False/*reg*/, &src, &dst); 2397 printf("r roundsd_010 "); 2398 showV128(&src); 2399 printf(" "); 2400 showV128(&dst); 2401 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2402 printf("\n"); 2403 2404 randV128(&src); 2405 randV128(&dst); 2406 memcpy(&src[0], &vals[i], 8); 2407 do_ROUNDSD_010(True/*mem*/, &src, &dst); 2408 printf("m roundsd_010 "); 2409 showV128(&src); 2410 printf(" "); 2411 showV128(&dst); 2412 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2413 printf("\n"); 2414 2415 2416 randV128(&src); 2417 randV128(&dst); 2418 memcpy(&src[0], &vals[i], 8); 2419 do_ROUNDSD_011(False/*reg*/, &src, &dst); 2420 printf("r roundsd_011 "); 2421 showV128(&src); 2422 printf(" "); 2423 showV128(&dst); 2424 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2425 printf("\n"); 2426 2427 randV128(&src); 2428 randV128(&dst); 2429 memcpy(&src[0], &vals[i], 8); 2430 do_ROUNDSD_011(True/*mem*/, &src, &dst); 2431 printf("m roundsd_011 "); 2432 showV128(&src); 2433 printf(" "); 2434 showV128(&dst); 2435 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2436 printf("\n"); 2437 } 2438 } 2439 2440 void test_ROUNDSD_w_mxcsr_rounding ( void ) 2441 { 2442 UInt rm; 2443 double vals[22]; 2444 Int i = 0; 2445 vals[i++] = 0.0; 2446 vals[i++] = -0.0; 2447 vals[i++] = 
mkPosInf(); 2448 vals[i++] = mkNegInf(); 2449 vals[i++] = mkPosNan(); 2450 vals[i++] = mkNegNan(); 2451 vals[i++] = -1.3; 2452 vals[i++] = -1.1; 2453 vals[i++] = -0.9; 2454 vals[i++] = -0.7; 2455 vals[i++] = -0.50001; 2456 vals[i++] = -0.49999; 2457 vals[i++] = -0.3; 2458 vals[i++] = -0.1; 2459 vals[i++] = 0.1; 2460 vals[i++] = 0.3; 2461 vals[i++] = 0.49999; 2462 vals[i++] = 0.50001; 2463 vals[i++] = 0.7; 2464 vals[i++] = 0.9; 2465 vals[i++] = 1.1; 2466 vals[i++] = 1.3; 2467 assert(i == 22); 2468 2469 rm = get_sse_roundingmode(); 2470 assert(rm == 0); // 0 == RN == default 2471 2472 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2473 V128 src, dst; 2474 2475 for (rm = 0; rm <= 3; rm++) { 2476 set_sse_roundingmode(rm); 2477 2478 randV128(&src); 2479 randV128(&dst); 2480 memcpy(&src[0], &vals[i], 8); 2481 do_ROUNDSD_1XX(False/*reg*/, &src, &dst); 2482 printf("r (rm=%u) roundsd_1XX ", rm); 2483 showV128(&src); 2484 printf(" "); 2485 showV128(&dst); 2486 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2487 printf("\n"); 2488 2489 randV128(&src); 2490 randV128(&dst); 2491 memcpy(&src[0], &vals[i], 8); 2492 do_ROUNDSD_1XX(True/*mem*/, &src, &dst); 2493 printf("m (rm=%u) roundsd_1XX ", rm); 2494 showV128(&src); 2495 printf(" "); 2496 showV128(&dst); 2497 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2498 printf("\n"); 2499 } 2500 } 2501 2502 rm = get_sse_roundingmode(); 2503 assert(rm == 3); 2504 set_sse_roundingmode(0); 2505 rm = get_sse_roundingmode(); 2506 assert(rm == 0); // 0 == RN == default 2507 } 2508 2509 2510 /* ------------ ROUNDSS ------------ */ 2511 2512 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2513 { 2514 if (mem) { 2515 __asm__ __volatile__( 2516 "movupd (%1), %%xmm11" "\n\t" 2517 "roundss $0, (%0), %%xmm11" "\n\t" 2518 "movupd %%xmm11, (%1)" "\n" 2519 : /*OUT*/ 2520 : /*IN*/ "r"(src), "r"(dst) 2521 : /*TRASH*/ "xmm11" 2522 ); 2523 } else { 2524 __asm__ __volatile__( 2525 "movupd (%1), %%xmm11" "\n\t" 2526 "movupd (%0), 
%%xmm2" "\n\t" 2527 "roundss $0, %%xmm2, %%xmm11" "\n\t" 2528 "movupd %%xmm11, (%1)" "\n" 2529 : /*OUT*/ 2530 : /*IN*/ "r"(src), "r"(dst) 2531 : /*TRASH*/ "xmm11","xmm2" 2532 ); 2533 } 2534 } 2535 2536 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2537 { 2538 if (mem) { 2539 __asm__ __volatile__( 2540 "movupd (%1), %%xmm11" "\n\t" 2541 "roundss $1, (%0), %%xmm11" "\n\t" 2542 "movupd %%xmm11, (%1)" "\n" 2543 : /*OUT*/ 2544 : /*IN*/ "r"(src), "r"(dst) 2545 : /*TRASH*/ "xmm11" 2546 ); 2547 } else { 2548 __asm__ __volatile__( 2549 "movupd (%1), %%xmm11" "\n\t" 2550 "movupd (%0), %%xmm2" "\n\t" 2551 "roundss $1, %%xmm2, %%xmm11" "\n\t" 2552 "movupd %%xmm11, (%1)" "\n" 2553 : /*OUT*/ 2554 : /*IN*/ "r"(src), "r"(dst) 2555 : /*TRASH*/ "xmm11","xmm2" 2556 ); 2557 } 2558 } 2559 2560 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2561 { 2562 if (mem) { 2563 __asm__ __volatile__( 2564 "movupd (%1), %%xmm11" "\n\t" 2565 "roundss $2, (%0), %%xmm11" "\n\t" 2566 "movupd %%xmm11, (%1)" "\n" 2567 : /*OUT*/ 2568 : /*IN*/ "r"(src), "r"(dst) 2569 : /*TRASH*/ "xmm11" 2570 ); 2571 } else { 2572 __asm__ __volatile__( 2573 "movupd (%1), %%xmm11" "\n\t" 2574 "movupd (%0), %%xmm2" "\n\t" 2575 "roundss $2, %%xmm2, %%xmm11" "\n\t" 2576 "movupd %%xmm11, (%1)" "\n" 2577 : /*OUT*/ 2578 : /*IN*/ "r"(src), "r"(dst) 2579 : /*TRASH*/ "xmm11","xmm2" 2580 ); 2581 } 2582 } 2583 2584 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2585 { 2586 if (mem) { 2587 __asm__ __volatile__( 2588 "movupd (%1), %%xmm11" "\n\t" 2589 "roundss $3, (%0), %%xmm11" "\n\t" 2590 "movupd %%xmm11, (%1)" "\n" 2591 : /*OUT*/ 2592 : /*IN*/ "r"(src), "r"(dst) 2593 : /*TRASH*/ "xmm11" 2594 ); 2595 } else { 2596 __asm__ __volatile__( 2597 "movupd (%1), %%xmm11" "\n\t" 2598 "movupd (%0), %%xmm2" "\n\t" 2599 "roundss $3, %%xmm2, %%xmm11" "\n\t" 2600 "movupd %%xmm11, (%1)" "\n" 2601 : /*OUT*/ 2602 : /*IN*/ "r"(src), "r"(dst) 2603 : /*TRASH*/ "xmm11","xmm2" 2604 ); 2605 } 2606 } 2607 2608 
void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2609 { 2610 if (mem) { 2611 __asm__ __volatile__( 2612 "movupd (%1), %%xmm11" "\n\t" 2613 "roundss $4, (%0), %%xmm11" "\n\t" 2614 "movupd %%xmm11, (%1)" "\n" 2615 : /*OUT*/ 2616 : /*IN*/ "r"(src), "r"(dst) 2617 : /*TRASH*/ "xmm11" 2618 ); 2619 } else { 2620 __asm__ __volatile__( 2621 "movupd (%1), %%xmm11" "\n\t" 2622 "movupd (%0), %%xmm2" "\n\t" 2623 "roundss $4, %%xmm2, %%xmm11" "\n\t" 2624 "movupd %%xmm11, (%1)" "\n" 2625 : /*OUT*/ 2626 : /*IN*/ "r"(src), "r"(dst) 2627 : /*TRASH*/ "xmm11","xmm2" 2628 ); 2629 } 2630 } 2631 2632 void test_ROUNDSS_w_immediate_rounding ( void ) 2633 { 2634 float vals[22]; 2635 Int i = 0; 2636 vals[i++] = 0.0; 2637 vals[i++] = -0.0; 2638 vals[i++] = mkPosInf(); 2639 vals[i++] = mkNegInf(); 2640 vals[i++] = mkPosNan(); 2641 vals[i++] = mkNegNan(); 2642 vals[i++] = -1.3; 2643 vals[i++] = -1.1; 2644 vals[i++] = -0.9; 2645 vals[i++] = -0.7; 2646 vals[i++] = -0.50001; 2647 vals[i++] = -0.49999; 2648 vals[i++] = -0.3; 2649 vals[i++] = -0.1; 2650 vals[i++] = 0.1; 2651 vals[i++] = 0.3; 2652 vals[i++] = 0.49999; 2653 vals[i++] = 0.50001; 2654 vals[i++] = 0.7; 2655 vals[i++] = 0.9; 2656 vals[i++] = 1.1; 2657 vals[i++] = 1.3; 2658 assert(i == 22); 2659 2660 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2661 V128 src, dst; 2662 2663 randV128(&src); 2664 randV128(&dst); 2665 memcpy(&src[0], &vals[i], 4); 2666 do_ROUNDSS_000(False/*reg*/, &src, &dst); 2667 printf("r roundss_000 "); 2668 showV128(&src); 2669 printf(" "); 2670 showV128(&dst); 2671 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2672 printf("\n"); 2673 2674 randV128(&src); 2675 randV128(&dst); 2676 memcpy(&src[0], &vals[i], 4); 2677 do_ROUNDSS_000(True/*mem*/, &src, &dst); 2678 printf("m roundss_000 "); 2679 showV128(&src); 2680 printf(" "); 2681 showV128(&dst); 2682 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2683 printf("\n"); 2684 2685 2686 randV128(&src); 2687 
randV128(&dst); 2688 memcpy(&src[0], &vals[i], 4); 2689 do_ROUNDSS_001(False/*reg*/, &src, &dst); 2690 printf("r roundss_001 "); 2691 showV128(&src); 2692 printf(" "); 2693 showV128(&dst); 2694 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2695 printf("\n"); 2696 2697 randV128(&src); 2698 randV128(&dst); 2699 memcpy(&src[0], &vals[i], 4); 2700 do_ROUNDSS_001(True/*mem*/, &src, &dst); 2701 printf("m roundss_001 "); 2702 showV128(&src); 2703 printf(" "); 2704 showV128(&dst); 2705 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2706 printf("\n"); 2707 2708 2709 randV128(&src); 2710 randV128(&dst); 2711 memcpy(&src[0], &vals[i], 4); 2712 do_ROUNDSS_010(False/*reg*/, &src, &dst); 2713 printf("r roundss_010 "); 2714 showV128(&src); 2715 printf(" "); 2716 showV128(&dst); 2717 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2718 printf("\n"); 2719 2720 randV128(&src); 2721 randV128(&dst); 2722 memcpy(&src[0], &vals[i], 4); 2723 do_ROUNDSS_010(True/*mem*/, &src, &dst); 2724 printf("m roundss_010 "); 2725 showV128(&src); 2726 printf(" "); 2727 showV128(&dst); 2728 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2729 printf("\n"); 2730 2731 2732 randV128(&src); 2733 randV128(&dst); 2734 memcpy(&src[0], &vals[i], 4); 2735 do_ROUNDSS_011(False/*reg*/, &src, &dst); 2736 printf("r roundss_011 "); 2737 showV128(&src); 2738 printf(" "); 2739 showV128(&dst); 2740 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2741 printf("\n"); 2742 2743 randV128(&src); 2744 randV128(&dst); 2745 memcpy(&src[0], &vals[i], 4); 2746 do_ROUNDSS_011(True/*mem*/, &src, &dst); 2747 printf("m roundss_011 "); 2748 showV128(&src); 2749 printf(" "); 2750 showV128(&dst); 2751 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2752 printf("\n"); 2753 } 2754 } 2755 2756 void test_ROUNDSS_w_mxcsr_rounding ( void ) 2757 { 2758 UInt rm; 2759 float vals[22]; 2760 Int i = 0; 2761 vals[i++] = 0.0; 2762 
vals[i++] = -0.0; 2763 vals[i++] = mkPosInf(); 2764 vals[i++] = mkNegInf(); 2765 vals[i++] = mkPosNan(); 2766 vals[i++] = mkNegNan(); 2767 vals[i++] = -1.3; 2768 vals[i++] = -1.1; 2769 vals[i++] = -0.9; 2770 vals[i++] = -0.7; 2771 vals[i++] = -0.50001; 2772 vals[i++] = -0.49999; 2773 vals[i++] = -0.3; 2774 vals[i++] = -0.1; 2775 vals[i++] = 0.1; 2776 vals[i++] = 0.3; 2777 vals[i++] = 0.49999; 2778 vals[i++] = 0.50001; 2779 vals[i++] = 0.7; 2780 vals[i++] = 0.9; 2781 vals[i++] = 1.1; 2782 vals[i++] = 1.3; 2783 assert(i == 22); 2784 2785 rm = get_sse_roundingmode(); 2786 assert(rm == 0); // 0 == RN == default 2787 2788 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2789 V128 src, dst; 2790 2791 for (rm = 0; rm <= 3; rm++) { 2792 set_sse_roundingmode(rm); 2793 2794 randV128(&src); 2795 randV128(&dst); 2796 memcpy(&src[0], &vals[i], 4); 2797 do_ROUNDSS_1XX(False/*reg*/, &src, &dst); 2798 printf("r (rm=%u) roundss_1XX ", rm); 2799 showV128(&src); 2800 printf(" "); 2801 showV128(&dst); 2802 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2803 printf("\n"); 2804 2805 randV128(&src); 2806 randV128(&dst); 2807 memcpy(&src[0], &vals[i], 4); 2808 do_ROUNDSS_1XX(True/*mem*/, &src, &dst); 2809 printf("m (rm=%u) roundss_1XX ", rm); 2810 showV128(&src); 2811 printf(" "); 2812 showV128(&dst); 2813 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2814 printf("\n"); 2815 } 2816 } 2817 2818 rm = get_sse_roundingmode(); 2819 assert(rm == 3); 2820 set_sse_roundingmode(0); 2821 rm = get_sse_roundingmode(); 2822 assert(rm == 0); // 0 == RN == default 2823 } 2824 2825 /* ------------ ROUNDPD ------------ */ 2826 2827 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2828 { 2829 if (mem) { 2830 __asm__ __volatile__( 2831 "movupd (%1), %%xmm11" "\n\t" 2832 "roundpd $0, (%0), %%xmm11" "\n\t" 2833 "movupd %%xmm11, (%1)" "\n" 2834 : /*OUT*/ 2835 : /*IN*/ "r"(src), "r"(dst) 2836 : /*TRASH*/ "xmm11" 2837 ); 2838 } else { 2839 __asm__ 
__volatile__( 2840 "movupd (%1), %%xmm11" "\n\t" 2841 "movupd (%0), %%xmm2" "\n\t" 2842 "roundpd $0, %%xmm2, %%xmm11" "\n\t" 2843 "movupd %%xmm11, (%1)" "\n" 2844 : /*OUT*/ 2845 : /*IN*/ "r"(src), "r"(dst) 2846 : /*TRASH*/ "xmm11","xmm2" 2847 ); 2848 } 2849 } 2850 2851 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2852 { 2853 if (mem) { 2854 __asm__ __volatile__( 2855 "movupd (%1), %%xmm11" "\n\t" 2856 "roundpd $1, (%0), %%xmm11" "\n\t" 2857 "movupd %%xmm11, (%1)" "\n" 2858 : /*OUT*/ 2859 : /*IN*/ "r"(src), "r"(dst) 2860 : /*TRASH*/ "xmm11" 2861 ); 2862 } else { 2863 __asm__ __volatile__( 2864 "movupd (%1), %%xmm11" "\n\t" 2865 "movupd (%0), %%xmm2" "\n\t" 2866 "roundpd $1, %%xmm2, %%xmm11" "\n\t" 2867 "movupd %%xmm11, (%1)" "\n" 2868 : /*OUT*/ 2869 : /*IN*/ "r"(src), "r"(dst) 2870 : /*TRASH*/ "xmm11","xmm2" 2871 ); 2872 } 2873 } 2874 2875 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2876 { 2877 if (mem) { 2878 __asm__ __volatile__( 2879 "movupd (%1), %%xmm11" "\n\t" 2880 "roundpd $2, (%0), %%xmm11" "\n\t" 2881 "movupd %%xmm11, (%1)" "\n" 2882 : /*OUT*/ 2883 : /*IN*/ "r"(src), "r"(dst) 2884 : /*TRASH*/ "xmm11" 2885 ); 2886 } else { 2887 __asm__ __volatile__( 2888 "movupd (%1), %%xmm11" "\n\t" 2889 "movupd (%0), %%xmm2" "\n\t" 2890 "roundpd $2, %%xmm2, %%xmm11" "\n\t" 2891 "movupd %%xmm11, (%1)" "\n" 2892 : /*OUT*/ 2893 : /*IN*/ "r"(src), "r"(dst) 2894 : /*TRASH*/ "xmm11","xmm2" 2895 ); 2896 } 2897 } 2898 2899 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2900 { 2901 if (mem) { 2902 __asm__ __volatile__( 2903 "movupd (%1), %%xmm11" "\n\t" 2904 "roundpd $3, (%0), %%xmm11" "\n\t" 2905 "movupd %%xmm11, (%1)" "\n" 2906 : /*OUT*/ 2907 : /*IN*/ "r"(src), "r"(dst) 2908 : /*TRASH*/ "xmm11" 2909 ); 2910 } else { 2911 __asm__ __volatile__( 2912 "movupd (%1), %%xmm11" "\n\t" 2913 "movupd (%0), %%xmm2" "\n\t" 2914 "roundpd $3, %%xmm2, %%xmm11" "\n\t" 2915 "movupd %%xmm11, (%1)" "\n" 2916 : /*OUT*/ 2917 : /*IN*/ "r"(src), "r"(dst) 
2918 : /*TRASH*/ "xmm11","xmm2" 2919 ); 2920 } 2921 } 2922 2923 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2924 { 2925 if (mem) { 2926 __asm__ __volatile__( 2927 "movupd (%1), %%xmm11" "\n\t" 2928 "roundpd $4, (%0), %%xmm11" "\n\t" 2929 "movupd %%xmm11, (%1)" "\n" 2930 : /*OUT*/ 2931 : /*IN*/ "r"(src), "r"(dst) 2932 : /*TRASH*/ "xmm11" 2933 ); 2934 } else { 2935 __asm__ __volatile__( 2936 "movupd (%1), %%xmm11" "\n\t" 2937 "movupd (%0), %%xmm2" "\n\t" 2938 "roundpd $4, %%xmm2, %%xmm11" "\n\t" 2939 "movupd %%xmm11, (%1)" "\n" 2940 : /*OUT*/ 2941 : /*IN*/ "r"(src), "r"(dst) 2942 : /*TRASH*/ "xmm11","xmm2" 2943 ); 2944 } 2945 } 2946 2947 void test_ROUNDPD_w_immediate_rounding ( void ) 2948 { 2949 double vals[22]; 2950 Int i = 0; 2951 vals[i++] = 0.0; 2952 vals[i++] = -0.0; 2953 vals[i++] = mkPosInf(); 2954 vals[i++] = mkNegInf(); 2955 vals[i++] = mkPosNan(); 2956 vals[i++] = mkNegNan(); 2957 vals[i++] = -1.3; 2958 vals[i++] = -1.1; 2959 vals[i++] = -0.9; 2960 vals[i++] = -0.7; 2961 vals[i++] = -0.50001; 2962 vals[i++] = -0.49999; 2963 vals[i++] = -0.3; 2964 vals[i++] = -0.1; 2965 vals[i++] = 0.1; 2966 vals[i++] = 0.3; 2967 vals[i++] = 0.49999; 2968 vals[i++] = 0.50001; 2969 vals[i++] = 0.7; 2970 vals[i++] = 0.9; 2971 vals[i++] = 1.1; 2972 vals[i++] = 1.3; 2973 assert(i == 22); 2974 2975 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2976 V128 src, dst; 2977 2978 randV128(&src); 2979 randV128(&dst); 2980 memcpy(&src[0], &vals[i], 8); 2981 memcpy(&src[8], &vals[(i+11)%22], 8); 2982 do_ROUNDPD_000(False/*reg*/, &src, &dst); 2983 printf("r roundpd_000 "); 2984 showV128(&src); 2985 printf(" "); 2986 showV128(&dst); 2987 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 2988 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 2989 printf("\n"); 2990 2991 randV128(&src); 2992 randV128(&dst); 2993 memcpy(&src[0], &vals[i], 8); 2994 memcpy(&src[8], &vals[(i+11)%22], 8); 2995 do_ROUNDPD_000(True/*mem*/, &src, &dst); 2996 printf("m 
roundpd_000 "); 2997 showV128(&src); 2998 printf(" "); 2999 showV128(&dst); 3000 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3001 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3002 printf("\n"); 3003 3004 3005 randV128(&src); 3006 randV128(&dst); 3007 memcpy(&src[0], &vals[i], 8); 3008 memcpy(&src[8], &vals[(i+11)%22], 8); 3009 do_ROUNDPD_001(False/*reg*/, &src, &dst); 3010 printf("r roundpd_001 "); 3011 showV128(&src); 3012 printf(" "); 3013 showV128(&dst); 3014 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3015 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3016 printf("\n"); 3017 3018 randV128(&src); 3019 randV128(&dst); 3020 memcpy(&src[0], &vals[i], 8); 3021 memcpy(&src[8], &vals[(i+11)%22], 8); 3022 do_ROUNDPD_001(True/*mem*/, &src, &dst); 3023 printf("m roundpd_001 "); 3024 showV128(&src); 3025 printf(" "); 3026 showV128(&dst); 3027 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3028 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3029 printf("\n"); 3030 3031 3032 randV128(&src); 3033 randV128(&dst); 3034 memcpy(&src[0], &vals[i], 8); 3035 memcpy(&src[8], &vals[(i+11)%22], 8); 3036 do_ROUNDPD_010(False/*reg*/, &src, &dst); 3037 printf("r roundpd_010 "); 3038 showV128(&src); 3039 printf(" "); 3040 showV128(&dst); 3041 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3042 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3043 printf("\n"); 3044 3045 randV128(&src); 3046 randV128(&dst); 3047 memcpy(&src[0], &vals[i], 8); 3048 memcpy(&src[8], &vals[(i+11)%22], 8); 3049 do_ROUNDPD_010(True/*mem*/, &src, &dst); 3050 printf("m roundpd_010 "); 3051 showV128(&src); 3052 printf(" "); 3053 showV128(&dst); 3054 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3055 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3056 printf("\n"); 3057 3058 3059 randV128(&src); 3060 randV128(&dst); 3061 memcpy(&src[0], &vals[i], 8); 3062 memcpy(&src[8], &vals[(i+11)%22], 
8); 3063 do_ROUNDPD_011(False/*reg*/, &src, &dst); 3064 printf("r roundpd_011 "); 3065 showV128(&src); 3066 printf(" "); 3067 showV128(&dst); 3068 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3069 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3070 printf("\n"); 3071 3072 randV128(&src); 3073 randV128(&dst); 3074 memcpy(&src[0], &vals[i], 8); 3075 memcpy(&src[8], &vals[(i+11)%22], 8); 3076 do_ROUNDPD_011(True/*mem*/, &src, &dst); 3077 printf("m roundpd_011 "); 3078 showV128(&src); 3079 printf(" "); 3080 showV128(&dst); 3081 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3082 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3083 printf("\n"); 3084 } 3085 } 3086 3087 void test_ROUNDPD_w_mxcsr_rounding ( void ) 3088 { 3089 UInt rm; 3090 double vals[22]; 3091 Int i = 0; 3092 vals[i++] = 0.0; 3093 vals[i++] = -0.0; 3094 vals[i++] = mkPosInf(); 3095 vals[i++] = mkNegInf(); 3096 vals[i++] = mkPosNan(); 3097 vals[i++] = mkNegNan(); 3098 vals[i++] = -1.3; 3099 vals[i++] = -1.1; 3100 vals[i++] = -0.9; 3101 vals[i++] = -0.7; 3102 vals[i++] = -0.50001; 3103 vals[i++] = -0.49999; 3104 vals[i++] = -0.3; 3105 vals[i++] = -0.1; 3106 vals[i++] = 0.1; 3107 vals[i++] = 0.3; 3108 vals[i++] = 0.49999; 3109 vals[i++] = 0.50001; 3110 vals[i++] = 0.7; 3111 vals[i++] = 0.9; 3112 vals[i++] = 1.1; 3113 vals[i++] = 1.3; 3114 assert(i == 22); 3115 3116 rm = get_sse_roundingmode(); 3117 assert(rm == 0); // 0 == RN == default 3118 3119 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3120 V128 src, dst; 3121 3122 for (rm = 0; rm <= 3; rm++) { 3123 set_sse_roundingmode(rm); 3124 3125 randV128(&src); 3126 randV128(&dst); 3127 memcpy(&src[0], &vals[i], 8); 3128 memcpy(&src[8], &vals[(i+11)%22], 8); 3129 do_ROUNDPD_1XX(False/*reg*/, &src, &dst); 3130 printf("r (rm=%u) roundpd_1XX ", rm); 3131 showV128(&src); 3132 printf(" "); 3133 showV128(&dst); 3134 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3135 printf(" %10f -> %10f", 
vals[(i+11)%22], *(double*)(&dst[8])); 3136 printf("\n"); 3137 3138 randV128(&src); 3139 randV128(&dst); 3140 memcpy(&src[0], &vals[i], 8); 3141 memcpy(&src[8], &vals[(i+11)%22], 8); 3142 do_ROUNDPD_1XX(True/*mem*/, &src, &dst); 3143 printf("m (rm=%u) roundpd_1XX ", rm); 3144 showV128(&src); 3145 printf(" "); 3146 showV128(&dst); 3147 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3148 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3149 printf("\n"); 3150 } 3151 } 3152 3153 rm = get_sse_roundingmode(); 3154 assert(rm == 3); 3155 set_sse_roundingmode(0); 3156 rm = get_sse_roundingmode(); 3157 assert(rm == 0); // 0 == RN == default 3158 } 3159 3160 /* ------------ ROUNDPS ------------ */ 3161 3162 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3163 { 3164 if (mem) { 3165 __asm__ __volatile__( 3166 "movupd (%1), %%xmm11" "\n\t" 3167 "roundps $0, (%0), %%xmm11" "\n\t" 3168 "movupd %%xmm11, (%1)" "\n" 3169 : /*OUT*/ 3170 : /*IN*/ "r"(src), "r"(dst) 3171 : /*TRASH*/ "xmm11" 3172 ); 3173 } else { 3174 __asm__ __volatile__( 3175 "movupd (%1), %%xmm11" "\n\t" 3176 "movupd (%0), %%xmm2" "\n\t" 3177 "roundps $0, %%xmm2, %%xmm11" "\n\t" 3178 "movupd %%xmm11, (%1)" "\n" 3179 : /*OUT*/ 3180 : /*IN*/ "r"(src), "r"(dst) 3181 : /*TRASH*/ "xmm11","xmm2" 3182 ); 3183 } 3184 } 3185 3186 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3187 { 3188 if (mem) { 3189 __asm__ __volatile__( 3190 "movupd (%1), %%xmm11" "\n\t" 3191 "roundps $1, (%0), %%xmm11" "\n\t" 3192 "movupd %%xmm11, (%1)" "\n" 3193 : /*OUT*/ 3194 : /*IN*/ "r"(src), "r"(dst) 3195 : /*TRASH*/ "xmm11" 3196 ); 3197 } else { 3198 __asm__ __volatile__( 3199 "movupd (%1), %%xmm11" "\n\t" 3200 "movupd (%0), %%xmm2" "\n\t" 3201 "roundps $1, %%xmm2, %%xmm11" "\n\t" 3202 "movupd %%xmm11, (%1)" "\n" 3203 : /*OUT*/ 3204 : /*IN*/ "r"(src), "r"(dst) 3205 : /*TRASH*/ "xmm11","xmm2" 3206 ); 3207 } 3208 } 3209 3210 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3211 { 3212 
if (mem) { 3213 __asm__ __volatile__( 3214 "movupd (%1), %%xmm11" "\n\t" 3215 "roundps $2, (%0), %%xmm11" "\n\t" 3216 "movupd %%xmm11, (%1)" "\n" 3217 : /*OUT*/ 3218 : /*IN*/ "r"(src), "r"(dst) 3219 : /*TRASH*/ "xmm11" 3220 ); 3221 } else { 3222 __asm__ __volatile__( 3223 "movupd (%1), %%xmm11" "\n\t" 3224 "movupd (%0), %%xmm2" "\n\t" 3225 "roundps $2, %%xmm2, %%xmm11" "\n\t" 3226 "movupd %%xmm11, (%1)" "\n" 3227 : /*OUT*/ 3228 : /*IN*/ "r"(src), "r"(dst) 3229 : /*TRASH*/ "xmm11","xmm2" 3230 ); 3231 } 3232 } 3233 3234 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3235 { 3236 if (mem) { 3237 __asm__ __volatile__( 3238 "movupd (%1), %%xmm11" "\n\t" 3239 "roundps $3, (%0), %%xmm11" "\n\t" 3240 "movupd %%xmm11, (%1)" "\n" 3241 : /*OUT*/ 3242 : /*IN*/ "r"(src), "r"(dst) 3243 : /*TRASH*/ "xmm11" 3244 ); 3245 } else { 3246 __asm__ __volatile__( 3247 "movupd (%1), %%xmm11" "\n\t" 3248 "movupd (%0), %%xmm2" "\n\t" 3249 "roundps $3, %%xmm2, %%xmm11" "\n\t" 3250 "movupd %%xmm11, (%1)" "\n" 3251 : /*OUT*/ 3252 : /*IN*/ "r"(src), "r"(dst) 3253 : /*TRASH*/ "xmm11","xmm2" 3254 ); 3255 } 3256 } 3257 3258 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 3259 { 3260 if (mem) { 3261 __asm__ __volatile__( 3262 "movupd (%1), %%xmm11" "\n\t" 3263 "roundps $4, (%0), %%xmm11" "\n\t" 3264 "movupd %%xmm11, (%1)" "\n" 3265 : /*OUT*/ 3266 : /*IN*/ "r"(src), "r"(dst) 3267 : /*TRASH*/ "xmm11" 3268 ); 3269 } else { 3270 __asm__ __volatile__( 3271 "movupd (%1), %%xmm11" "\n\t" 3272 "movupd (%0), %%xmm2" "\n\t" 3273 "roundps $4, %%xmm2, %%xmm11" "\n\t" 3274 "movupd %%xmm11, (%1)" "\n" 3275 : /*OUT*/ 3276 : /*IN*/ "r"(src), "r"(dst) 3277 : /*TRASH*/ "xmm11","xmm2" 3278 ); 3279 } 3280 } 3281 3282 void test_ROUNDPS_w_immediate_rounding ( void ) 3283 { 3284 float vals[22]; 3285 Int i = 0; 3286 vals[i++] = 0.0; 3287 vals[i++] = -0.0; 3288 vals[i++] = mkPosInf(); 3289 vals[i++] = mkNegInf(); 3290 vals[i++] = mkPosNan(); 3291 vals[i++] = mkNegNan(); 3292 vals[i++] = -1.3; 
3293 vals[i++] = -1.1; 3294 vals[i++] = -0.9; 3295 vals[i++] = -0.7; 3296 vals[i++] = -0.50001; 3297 vals[i++] = -0.49999; 3298 vals[i++] = -0.3; 3299 vals[i++] = -0.1; 3300 vals[i++] = 0.1; 3301 vals[i++] = 0.3; 3302 vals[i++] = 0.49999; 3303 vals[i++] = 0.50001; 3304 vals[i++] = 0.7; 3305 vals[i++] = 0.9; 3306 vals[i++] = 1.1; 3307 vals[i++] = 1.3; 3308 assert(i == 22); 3309 3310 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3311 V128 src, dst; 3312 3313 randV128(&src); 3314 randV128(&dst); 3315 memcpy(&src[0], &vals[i], 4); 3316 memcpy(&src[4], &vals[(i+5)%22], 4); 3317 memcpy(&src[8], &vals[(i+11)%22], 4); 3318 memcpy(&src[12], &vals[(i+17)%22], 4); 3319 do_ROUNDPS_000(False/*reg*/, &src, &dst); 3320 printf("r roundps_000 "); 3321 showV128(&src); 3322 printf(" "); 3323 showV128(&dst); 3324 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3325 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3326 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3327 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3328 printf("\n"); 3329 3330 randV128(&src); 3331 randV128(&dst); 3332 memcpy(&src[0], &vals[i], 4); 3333 memcpy(&src[4], &vals[(i+5)%22], 4); 3334 memcpy(&src[8], &vals[(i+11)%22], 4); 3335 memcpy(&src[12], &vals[(i+17)%22], 4); 3336 do_ROUNDPS_000(True/*mem*/, &src, &dst); 3337 printf("m roundps_000 "); 3338 showV128(&src); 3339 printf(" "); 3340 showV128(&dst); 3341 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3342 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3343 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3344 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3345 printf("\n"); 3346 3347 3348 randV128(&src); 3349 randV128(&dst); 3350 memcpy(&src[0], &vals[i], 4); 3351 memcpy(&src[4], &vals[(i+5)%22], 4); 3352 memcpy(&src[8], &vals[(i+11)%22], 4); 3353 memcpy(&src[12], &vals[(i+17)%22], 4); 3354 do_ROUNDPS_001(False/*reg*/, &src, &dst); 
3355 printf("r roundps_001 "); 3356 showV128(&src); 3357 printf(" "); 3358 showV128(&dst); 3359 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3360 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3361 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3362 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3363 printf("\n"); 3364 3365 randV128(&src); 3366 randV128(&dst); 3367 memcpy(&src[0], &vals[i], 4); 3368 memcpy(&src[4], &vals[(i+5)%22], 4); 3369 memcpy(&src[8], &vals[(i+11)%22], 4); 3370 memcpy(&src[12], &vals[(i+17)%22], 4); 3371 do_ROUNDPS_001(True/*mem*/, &src, &dst); 3372 printf("m roundps_001 "); 3373 showV128(&src); 3374 printf(" "); 3375 showV128(&dst); 3376 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3377 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3378 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3379 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3380 printf("\n"); 3381 3382 3383 randV128(&src); 3384 randV128(&dst); 3385 memcpy(&src[0], &vals[i], 4); 3386 memcpy(&src[4], &vals[(i+5)%22], 4); 3387 memcpy(&src[8], &vals[(i+11)%22], 4); 3388 memcpy(&src[12], &vals[(i+17)%22], 4); 3389 do_ROUNDPS_010(False/*reg*/, &src, &dst); 3390 printf("r roundps_010 "); 3391 showV128(&src); 3392 printf(" "); 3393 showV128(&dst); 3394 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3395 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3396 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3397 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3398 printf("\n"); 3399 3400 randV128(&src); 3401 randV128(&dst); 3402 memcpy(&src[0], &vals[i], 4); 3403 memcpy(&src[4], &vals[(i+5)%22], 4); 3404 memcpy(&src[8], &vals[(i+11)%22], 4); 3405 memcpy(&src[12], &vals[(i+17)%22], 4); 3406 do_ROUNDPS_010(True/*mem*/, &src, &dst); 3407 printf("m roundps_010 "); 3408 showV128(&src); 3409 printf(" "); 3410 
showV128(&dst); 3411 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3412 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3413 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3414 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3415 printf("\n"); 3416 3417 3418 randV128(&src); 3419 randV128(&dst); 3420 memcpy(&src[0], &vals[i], 4); 3421 memcpy(&src[4], &vals[(i+5)%22], 4); 3422 memcpy(&src[8], &vals[(i+11)%22], 4); 3423 memcpy(&src[12], &vals[(i+17)%22], 4); 3424 do_ROUNDPS_011(False/*reg*/, &src, &dst); 3425 printf("r roundps_011 "); 3426 showV128(&src); 3427 printf(" "); 3428 showV128(&dst); 3429 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3430 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3431 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3432 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3433 printf("\n"); 3434 3435 randV128(&src); 3436 randV128(&dst); 3437 memcpy(&src[0], &vals[i], 4); 3438 memcpy(&src[4], &vals[(i+5)%22], 4); 3439 memcpy(&src[8], &vals[(i+11)%22], 4); 3440 memcpy(&src[12], &vals[(i+17)%22], 4); 3441 do_ROUNDPS_011(True/*mem*/, &src, &dst); 3442 printf("m roundps_011 "); 3443 showV128(&src); 3444 printf(" "); 3445 showV128(&dst); 3446 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3447 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3448 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3449 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3450 printf("\n"); 3451 } 3452 } 3453 3454 void test_ROUNDPS_w_mxcsr_rounding ( void ) 3455 { 3456 UInt rm; 3457 float vals[22]; 3458 Int i = 0; 3459 vals[i++] = 0.0; 3460 vals[i++] = -0.0; 3461 vals[i++] = mkPosInf(); 3462 vals[i++] = mkNegInf(); 3463 vals[i++] = mkPosNan(); 3464 vals[i++] = mkNegNan(); 3465 vals[i++] = -1.3; 3466 vals[i++] = -1.1; 3467 vals[i++] = -0.9; 3468 vals[i++] = -0.7; 3469 vals[i++] = -0.50001; 
3470 vals[i++] = -0.49999; 3471 vals[i++] = -0.3; 3472 vals[i++] = -0.1; 3473 vals[i++] = 0.1; 3474 vals[i++] = 0.3; 3475 vals[i++] = 0.49999; 3476 vals[i++] = 0.50001; 3477 vals[i++] = 0.7; 3478 vals[i++] = 0.9; 3479 vals[i++] = 1.1; 3480 vals[i++] = 1.3; 3481 assert(i == 22); 3482 3483 rm = get_sse_roundingmode(); 3484 assert(rm == 0); // 0 == RN == default 3485 3486 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3487 V128 src, dst; 3488 3489 for (rm = 0; rm <= 3; rm++) { 3490 set_sse_roundingmode(rm); 3491 3492 randV128(&src); 3493 randV128(&dst); 3494 memcpy(&src[0], &vals[i], 4); 3495 memcpy(&src[4], &vals[(i+5)%22], 4); 3496 memcpy(&src[8], &vals[(i+11)%22], 4); 3497 memcpy(&src[12], &vals[(i+17)%22], 4); 3498 do_ROUNDPS_1XX(False/*reg*/, &src, &dst); 3499 printf("r (rm=%u) roundps_1XX ", rm); 3500 showV128(&src); 3501 printf(" "); 3502 showV128(&dst); 3503 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3504 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3505 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3506 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3507 printf("\n"); 3508 3509 randV128(&src); 3510 randV128(&dst); 3511 memcpy(&src[0], &vals[i], 4); 3512 memcpy(&src[4], &vals[(i+5)%22], 4); 3513 memcpy(&src[8], &vals[(i+11)%22], 4); 3514 memcpy(&src[12], &vals[(i+17)%22], 4); 3515 do_ROUNDPS_1XX(True/*mem*/, &src, &dst); 3516 printf("m (rm=%u) roundps_1XX ", rm); 3517 showV128(&src); 3518 printf(" "); 3519 showV128(&dst); 3520 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3521 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3522 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3523 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3524 printf("\n"); 3525 } 3526 } 3527 3528 rm = get_sse_roundingmode(); 3529 assert(rm == 3); 3530 set_sse_roundingmode(0); 3531 rm = get_sse_roundingmode(); 3532 assert(rm == 0); // 0 == RN == 
default 3533 } 3534 3535 /* ------------ PTEST ------------ */ 3536 3537 void test_PTEST ( void ) 3538 { 3539 const Int ntests = 8; 3540 V128 spec[ntests]; 3541 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL ); 3542 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL ); 3543 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL ); 3544 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL ); 3545 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL ); 3546 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL ); 3547 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL ); 3548 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL ); 3549 V128 block[2]; 3550 Int i, j; 3551 ULong flags; 3552 for (i = 0; i < ntests; i++) { 3553 for (j = 0; j < ntests; j++) { 3554 memcpy(&block[0], &spec[i], 16); 3555 memcpy(&block[1], &spec[j], 16); 3556 __asm__ __volatile__( 3557 "subq $256, %%rsp" "\n\t" 3558 "movupd 0(%1), %%xmm2" "\n\t" 3559 "ptest 16(%1), %%xmm2" "\n\t" 3560 "pushfq" "\n\t" 3561 "popq %0" "\n\t" 3562 "addq $256, %%rsp" "\n\t" 3563 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) : 3564 "xmm2", "memory", "cc" 3565 ); 3566 printf("r ptest "); 3567 showV128(&block[0]); 3568 printf(" "); 3569 showV128(&block[1]); 3570 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5); 3571 } 3572 } 3573 } 3574 3575 /* ------------ PBLENDVB ------------ */ 3576 3577 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3578 { 3579 if (mem) { 3580 __asm__ __volatile__( 3581 "movupd (%2), %%xmm0" "\n\t" 3582 "movupd (%1), %%xmm11" "\n\t" 3583 "pblendvb (%0), %%xmm11" "\n\t" 3584 "movupd %%xmm11, (%1)" "\n" 3585 : /*OUT*/ 3586 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3587 : /*TRASH*/ "xmm11","xmm0" 3588 ); 3589 } else { 3590 __asm__ __volatile__( 3591 "movupd (%2), %%xmm0" "\n\t" 3592 "movupd (%1), %%xmm11" "\n\t" 3593 "movupd (%0), %%xmm2" "\n\t" 
3594 "pblendvb %%xmm2, %%xmm11" "\n\t" 3595 "movupd %%xmm11, (%1)" "\n" 3596 : /*OUT*/ 3597 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3598 : /*TRASH*/ "xmm11","xmm2","xmm0" 3599 ); 3600 } 3601 } 3602 3603 void test_PBLENDVB ( void ) 3604 { 3605 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3606 Int i; 3607 for (i = 0; i < 10; i++) { 3608 randV128(&t_xmm0); 3609 randV128(&t_src); 3610 randV128(&t_dst); 3611 3612 memcpy(&xmm0, &t_xmm0, 16); 3613 memcpy(&src, &t_src, 16); 3614 memcpy(&dst, &t_dst, 16); 3615 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst); 3616 printf("r pblendvb "); 3617 showV128(&t_xmm0); 3618 printf(" "); 3619 showV128(&t_src); 3620 printf(" "); 3621 showV128(&t_dst); 3622 printf(" -> "); 3623 showV128(&dst); 3624 printf("\n"); 3625 3626 memcpy(&xmm0, &t_xmm0, 16); 3627 memcpy(&src, &t_src, 16); 3628 memcpy(&dst, &t_dst, 16); 3629 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst); 3630 printf("m pblendvb "); 3631 showV128(&t_xmm0); 3632 printf(" "); 3633 showV128(&t_src); 3634 printf(" "); 3635 showV128(&t_dst); 3636 printf(" -> "); 3637 showV128(&dst); 3638 printf("\n"); 3639 } 3640 } 3641 3642 /* ------------ BLENDVPD ------------ */ 3643 3644 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3645 { 3646 if (mem) { 3647 __asm__ __volatile__( 3648 "movupd (%2), %%xmm0" "\n\t" 3649 "movupd (%1), %%xmm11" "\n\t" 3650 "blendvpd (%0), %%xmm11" "\n\t" 3651 "movupd %%xmm11, (%1)" "\n" 3652 : /*OUT*/ 3653 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3654 : /*TRASH*/ "xmm11","xmm0" 3655 ); 3656 } else { 3657 __asm__ __volatile__( 3658 "movupd (%2), %%xmm0" "\n\t" 3659 "movupd (%1), %%xmm11" "\n\t" 3660 "movupd (%0), %%xmm2" "\n\t" 3661 "blendvpd %%xmm2, %%xmm11" "\n\t" 3662 "movupd %%xmm11, (%1)" "\n" 3663 : /*OUT*/ 3664 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3665 : /*TRASH*/ "xmm11","xmm2","xmm0" 3666 ); 3667 } 3668 } 3669 3670 void test_BLENDVPD ( void ) 3671 { 3672 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3673 Int i; 3674 for (i = 0; i < 10; i++) { 
3675 randV128(&t_xmm0); 3676 randV128(&t_src); 3677 randV128(&t_dst); 3678 3679 memcpy(&xmm0, &t_xmm0, 16); 3680 memcpy(&src, &t_src, 16); 3681 memcpy(&dst, &t_dst, 16); 3682 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst); 3683 printf("r blendvpd "); 3684 showV128(&t_xmm0); 3685 printf(" "); 3686 showV128(&t_src); 3687 printf(" "); 3688 showV128(&t_dst); 3689 printf(" -> "); 3690 showV128(&dst); 3691 printf("\n"); 3692 3693 memcpy(&xmm0, &t_xmm0, 16); 3694 memcpy(&src, &t_src, 16); 3695 memcpy(&dst, &t_dst, 16); 3696 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst); 3697 printf("m blendvpd "); 3698 showV128(&t_xmm0); 3699 printf(" "); 3700 showV128(&t_src); 3701 printf(" "); 3702 showV128(&t_dst); 3703 printf(" -> "); 3704 showV128(&dst); 3705 printf("\n"); 3706 } 3707 } 3708 3709 /* ------------ BLENDVPS ------------ */ 3710 3711 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3712 { 3713 if (mem) { 3714 __asm__ __volatile__( 3715 "movupd (%2), %%xmm0" "\n\t" 3716 "movupd (%1), %%xmm11" "\n\t" 3717 "blendvps (%0), %%xmm11" "\n\t" 3718 "movupd %%xmm11, (%1)" "\n" 3719 : /*OUT*/ 3720 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3721 : /*TRASH*/ "xmm11","xmm0" 3722 ); 3723 } else { 3724 __asm__ __volatile__( 3725 "movupd (%2), %%xmm0" "\n\t" 3726 "movupd (%1), %%xmm11" "\n\t" 3727 "movupd (%0), %%xmm2" "\n\t" 3728 "blendvps %%xmm2, %%xmm11" "\n\t" 3729 "movupd %%xmm11, (%1)" "\n" 3730 : /*OUT*/ 3731 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3732 : /*TRASH*/ "xmm11","xmm2","xmm0" 3733 ); 3734 } 3735 } 3736 3737 void test_BLENDVPS ( void ) 3738 { 3739 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3740 Int i; 3741 for (i = 0; i < 10; i++) { 3742 randV128(&t_xmm0); 3743 randV128(&t_src); 3744 randV128(&t_dst); 3745 3746 memcpy(&xmm0, &t_xmm0, 16); 3747 memcpy(&src, &t_src, 16); 3748 memcpy(&dst, &t_dst, 16); 3749 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst); 3750 printf("r blendvps "); 3751 showV128(&t_xmm0); 3752 printf(" "); 3753 showV128(&t_src); 3754 printf(" "); 
3755 showV128(&t_dst); 3756 printf(" -> "); 3757 showV128(&dst); 3758 printf("\n"); 3759 3760 memcpy(&xmm0, &t_xmm0, 16); 3761 memcpy(&src, &t_src, 16); 3762 memcpy(&dst, &t_dst, 16); 3763 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst); 3764 printf("m blendvps "); 3765 showV128(&t_xmm0); 3766 printf(" "); 3767 showV128(&t_src); 3768 printf(" "); 3769 showV128(&t_dst); 3770 printf(" -> "); 3771 showV128(&dst); 3772 printf("\n"); 3773 } 3774 } 3775 3776 /* ------------ main ------------ */ 3777 3778 int main ( int argc, char** argv ) 3779 { 3780 #if 1 3781 // ------ SSE 4.1 ------ 3782 test_BLENDPD(); // done Apr.01.2010 3783 test_BLENDPS(); // done Apr.02.2010 3784 test_PBLENDW(); 3785 test_PBLENDVB(); 3786 test_BLENDVPD(); 3787 test_BLENDVPS(); 3788 test_DPPD(); // done Apr.08.2010 3789 test_DPPS(); // done Apr.09.2010 3790 test_EXTRACTPS(); 3791 test_INSERTPS(); // done Apr.01.2010 3792 // MOVNTDQA *** 3793 //test_MPSADBW(); 3794 test_PCMPEQQ(); 3795 test_PEXTRB(); // done Apr.15.2010 3796 test_PEXTRD(); // done Apr.14.2010 3797 test_PEXTRQ(); // done Apr.14.2010 3798 test_PEXTRW(); // done Apr.14.2010 3799 test_PINSRQ(); // done Apr.16.2010 3800 test_PINSRD(); // todo 3801 test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? 
*/ 3802 test_PINSRB(); // todo 3803 //test_PHMINPOSUW(); 3804 test_PMAXSB(); 3805 test_PMAXSD(); // done Apr.09.2010 3806 test_PMAXUD(); // done Apr.16.2010 3807 test_PMAXUW(); 3808 test_PMINSB(); 3809 test_PMINSD(); // done Apr.09.2010 3810 test_PMINUD(); 3811 test_PMINUW(); 3812 test_PMOVSXBW(); // done Apr.02.2010 3813 test_PMOVSXBD(); // done Mar.30.2010 3814 test_PMOVSXBQ(); // done Mar.30.2010 3815 test_PMOVSXWD(); // done Mar.31.2010 3816 test_PMOVSXWQ(); // done Mar.31.2010 3817 test_PMOVSXDQ(); // done Mar.31.2010 3818 test_PMOVZXBW(); // done Mar.28.2010 3819 test_PMOVZXBD(); // done Mar.29.2010 3820 test_PMOVZXBQ(); // done Mar.29.2010 3821 test_PMOVZXWD(); // done Mar.28.2010 3822 test_PMOVZXWQ(); // done Mar.29.2010 3823 test_PMOVZXDQ(); // done Mar.29.2010 3824 test_POPCNTW(); 3825 test_POPCNTL(); 3826 test_POPCNTQ(); 3827 test_PMULDQ(); 3828 test_PMULLD(); 3829 test_PTEST(); 3830 test_ROUNDSD_w_immediate_rounding(); 3831 test_ROUNDSS_w_immediate_rounding(); 3832 test_ROUNDPD_w_immediate_rounding(); 3833 test_ROUNDPS_w_immediate_rounding(); 3834 test_ROUNDSD_w_mxcsr_rounding(); 3835 test_ROUNDSS_w_mxcsr_rounding(); 3836 test_ROUNDPD_w_mxcsr_rounding(); 3837 test_ROUNDPS_w_mxcsr_rounding(); 3838 // ------ SSE 4.2 ------ 3839 test_PCMPGTQ(); 3840 // CRC32B,Q 3841 test_PACKUSDW(); 3842 3843 #else 3844 #if 0 3845 test_MPSADBW(); 3846 test_PCMPEQQ(); 3847 test_PHMINPOSUW(); 3848 test_PMULDQ(); 3849 #endif 3850 #endif 3851 3852 return 0; 3853 } 3854 3855