1 2 /* A program to test SSE4.1/SSE4.2 instructions. 3 Revisions: Nov.208 - wrote this file 4 Apr.10.2010 - added PEXTR* tests 5 Apr.16.2010 - added PINS* tests 6 */ 7 8 /* HOW TO COMPILE: 9 gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c 10 */ 11 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <assert.h> 15 //#include "tests/malloc.h" // reenable when reintegrated 16 #include <string.h> 17 18 19 20 // rmme when reintegrated 21 // Allocates a 16-aligned block. Asserts if the allocation fails. 22 #ifdef VGO_darwin 23 #include <stdlib.h> 24 #else 25 #include <malloc.h> 26 #endif 27 __attribute__((unused)) 28 static void* memalign16(size_t szB) 29 { 30 void* x; 31 #if defined(VGO_darwin) 32 // Darwin lacks memalign, but its malloc is always 16-aligned anyway. 33 x = malloc(szB); 34 #else 35 x = memalign(16, szB); 36 #endif 37 assert(x); 38 assert(0 == ((16-1) & (unsigned long)x)); 39 return x; 40 } 41 42 43 44 typedef unsigned char V128[16]; 45 typedef unsigned int UInt; 46 typedef signed int Int; 47 typedef unsigned char UChar; 48 typedef unsigned long long int ULong; 49 50 typedef unsigned char Bool; 51 #define False ((Bool)0) 52 #define True ((Bool)1) 53 54 55 typedef 56 struct { 57 V128 arg1; 58 V128 arg2; 59 V128 res; 60 } 61 RRArgs; 62 63 typedef 64 struct { 65 V128 arg1; 66 V128 res; 67 } 68 RMArgs; 69 70 static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo ) 71 { 72 // try to sidestep strict-aliasing snafus by memcpying explicitly 73 UChar* p = (UChar*)res; 74 memcpy(&p[8], (UChar*)&wHi, 8); 75 memcpy(&p[0], (UChar*)&wLo, 8); 76 } 77 78 static UChar randUChar ( void ) 79 { 80 static UInt seed = 80021; 81 seed = 1103515245 * seed + 12345; 82 return (seed >> 17) & 0xFF; 83 } 84 85 static ULong randULong ( void ) 86 { 87 Int i; 88 ULong r = 0; 89 for (i = 0; i < 8; i++) { 90 r = (r << 8) | (ULong)(0xFF & randUChar()); 91 } 92 return r; 93 } 94 95 static void randV128 ( V128* v ) 96 { 97 Int i; 98 for (i = 0; i < 16; i++) 99 (*v)[i] = randUChar(); 
100 } 101 102 static void showV128 ( V128* v ) 103 { 104 Int i; 105 for (i = 15; i >= 0; i--) 106 printf("%02x", (Int)(*v)[i]); 107 } 108 109 static void showMaskedV128 ( V128* v, V128* mask ) 110 { 111 Int i; 112 for (i = 15; i >= 0; i--) 113 printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) )); 114 } 115 116 static void showIGVV( char* rOrM, char* op, Int imm, 117 ULong src64, V128* dst, V128* res ) 118 { 119 printf("%s %10s $%d ", rOrM, op, imm); 120 printf("%016llx", src64); 121 printf(" "); 122 showV128(dst); 123 printf(" "); 124 showV128(res); 125 printf("\n"); 126 } 127 128 static void showIAG ( char* rOrM, char* op, Int imm, 129 V128* argL, ULong argR, ULong res ) 130 { 131 printf("%s %10s $%d ", rOrM, op, imm); 132 showV128(argL); 133 printf(" "); 134 printf("%016llx", argR); 135 printf(" "); 136 printf("%016llx", res); 137 printf("\n"); 138 } 139 140 static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask ) 141 { 142 printf("%s %10s $%d ", rOrM, op, imm); 143 showV128(&rra->arg1); 144 printf(" "); 145 showV128(&rra->arg2); 146 printf(" "); 147 showMaskedV128(&rra->res, rmask); 148 printf("\n"); 149 } 150 151 static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask ) 152 { 153 printf("%s %10s ", rOrM, op); 154 showV128(&rra->arg1); 155 printf(" "); 156 showV128(&rra->arg2); 157 printf(" "); 158 showMaskedV128(&rra->res, rmask); 159 printf("\n"); 160 } 161 162 /* Note: these are little endian. Hence first byte is the least 163 significant byte of lane zero. */ 164 165 /* Mask for insns where all result bits are non-approximated. */ 166 static V128 AllMask = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 167 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF }; 168 169 /* Mark for insns which produce approximated vector short results. 
*/ 170 __attribute__((unused)) 171 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF, 172 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF }; 173 174 /* Mark for insns which produce approximated scalar short results. */ 175 __attribute__((unused)) 176 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF, 177 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF }; 178 179 static V128 fives = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55, 180 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 }; 181 182 static V128 zeroes = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 183 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; 184 185 double mkPosInf ( void ) { return 1.0 / 0.0; } 186 double mkNegInf ( void ) { return -mkPosInf(); } 187 double mkPosNan ( void ) { return 0.0 / 0.0; } 188 double mkNegNan ( void ) { return -mkPosNan(); } 189 190 __attribute__((noinline)) 191 UInt get_mxcsr ( void ) 192 { 193 ULong w64; 194 __asm__ __volatile__( 195 "subq $8, %%rsp" "\n\t" 196 "stmxcsr (%%rsp)" "\n\t" 197 "movq (%%rsp), %0" "\n" 198 "addq $8, %%rsp" 199 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc" 200 ); 201 if (0) printf("get %08x\n", (UInt)w64); 202 return (UInt)w64; 203 } 204 205 __attribute__((noinline)) 206 void set_mxcsr ( UInt w32 ) 207 { 208 if (0) printf("set %08x\n", w32); 209 ULong w64 = (ULong)w32; 210 __asm__ __volatile__( 211 "subq $8, %%rsp" "\n\t" 212 "movq %0, (%%rsp)" "\n\t" 213 "ldmxcsr (%%rsp)" "\n\t" 214 "addq $8, %%rsp" 215 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc" 216 ); 217 } 218 219 UInt get_sse_roundingmode ( void ) 220 { 221 UInt w = get_mxcsr(); 222 return (w >> 13) & 3; 223 } 224 225 void set_sse_roundingmode ( UInt m ) 226 { 227 UInt w; 228 assert(0 == (m & ~3)); 229 w = get_mxcsr(); 230 w &= ~(3 << 13); 231 w |= (m << 13); 232 set_mxcsr(w); 233 } 234 235 236 #define DO_imm_r_r(_opname, _imm, _src, _dst) \ 237 { \ 238 V128 _tmp; \ 239 __asm__ __volatile__( \ 240 "movupd (%0), %%xmm2" "\n\t" \ 241 "movupd (%1), %%xmm11" "\n\t" \ 242 _opname " $" 
#_imm ", %%xmm2, %%xmm11" "\n\t" \ 243 "movupd %%xmm11, (%2)" "\n" \ 244 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 245 : "cc", "memory", "xmm2", "xmm11" \ 246 ); \ 247 RRArgs rra; \ 248 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 249 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 250 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 251 showIAA("r", (_opname), (_imm), &rra, &AllMask); \ 252 } 253 254 #define DO_imm_m_r(_opname, _imm, _src, _dst) \ 255 { \ 256 V128 _tmp; \ 257 V128* _srcM = memalign16(sizeof(V128)); \ 258 memcpy(_srcM, &(_src), sizeof(V128)); \ 259 __asm__ __volatile__( \ 260 "movupd (%1), %%xmm11" "\n\t" \ 261 _opname " $" #_imm ", (%0), %%xmm11" "\n\t" \ 262 "movupd %%xmm11, (%2)" "\n" \ 263 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 264 : "cc", "memory", "xmm11" \ 265 ); \ 266 RRArgs rra; \ 267 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 268 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 269 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 270 showIAA("m", (_opname), (_imm), &rra, &AllMask); \ 271 free(_srcM); \ 272 } 273 274 #define DO_imm_mandr_r(_opname, _imm, _src, _dst) \ 275 DO_imm_r_r( _opname, _imm, _src, _dst ) \ 276 DO_imm_m_r( _opname, _imm, _src, _dst ) 277 278 279 280 281 282 #define DO_r_r(_opname, _src, _dst) \ 283 { \ 284 V128 _tmp; \ 285 __asm__ __volatile__( \ 286 "movupd (%0), %%xmm2" "\n\t" \ 287 "movupd (%1), %%xmm11" "\n\t" \ 288 _opname " %%xmm2, %%xmm11" "\n\t" \ 289 "movupd %%xmm11, (%2)" "\n" \ 290 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 291 : "cc", "memory", "xmm2", "xmm11" \ 292 ); \ 293 RRArgs rra; \ 294 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 295 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 296 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 297 showAA("r", (_opname), &rra, &AllMask); \ 298 } 299 300 #define DO_m_r(_opname, _src, _dst) \ 301 { \ 302 V128 _tmp; \ 303 V128* _srcM = memalign16(sizeof(V128)); \ 304 memcpy(_srcM, &(_src), sizeof(V128)); \ 305 __asm__ __volatile__( \ 
306 "movupd (%1), %%xmm11" "\n\t" \ 307 _opname " (%0), %%xmm11" "\n\t" \ 308 "movupd %%xmm11, (%2)" "\n" \ 309 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 310 : "cc", "memory", "xmm11" \ 311 ); \ 312 RRArgs rra; \ 313 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 314 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 315 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 316 showAA("m", (_opname), &rra, &AllMask); \ 317 free(_srcM); \ 318 } 319 320 #define DO_mandr_r(_opname, _src, _dst) \ 321 DO_r_r(_opname, _src, _dst) \ 322 DO_m_r(_opname, _src, _dst) 323 324 325 326 327 #define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix) \ 328 { \ 329 ULong _scbefore = 0x5555555555555555ULL; \ 330 ULong _scafter = 0xAAAAAAAAAAAAAAAAULL; \ 331 /* This assumes that gcc won't make any of %0, %1, %2 */ \ 332 /* be r11. That should be ensured (cough, cough) */ \ 333 /* by declaring r11 to be clobbered. */ \ 334 __asm__ __volatile__( \ 335 "movupd (%0), %%xmm2" "\n\t" \ 336 "movq (%1), %%r11" "\n\t" \ 337 _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix "\n\t" \ 338 "movq %%r11, (%2)" "\n" \ 339 : /*out*/ \ 340 : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter)) \ 341 : "cc", "memory", "xmm2", "r11" \ 342 ); \ 343 showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 344 } 345 346 #define DO_imm_r_to_mscalar(_opname, _imm, _src) \ 347 { \ 348 ULong _scbefore = 0x5555555555555555ULL; \ 349 ULong _scafter = _scbefore; \ 350 __asm__ __volatile__( \ 351 "movupd (%0), %%xmm2" "\n\t" \ 352 _opname " $" #_imm ", %%xmm2, (%1)" "\n\t" \ 353 : /*out*/ \ 354 : /*in*/ "r"(&(_src)), "r"(&(_scafter)) \ 355 : "cc", "memory", "xmm2" \ 356 ); \ 357 showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 358 } 359 360 #define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix) \ 361 DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix ) \ 362 DO_imm_r_to_mscalar( _opname, _imm, _src ) 363 364 365 366 367 368 369 370 371 #define 
DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix) \ 372 { \ 373 V128 dstv; \ 374 V128 res; \ 375 ULong src64 = (ULong)(_src); \ 376 memcpy(dstv, fives, sizeof(dstv)); \ 377 memcpy(res, zeroes, sizeof(res)); \ 378 /* This assumes that gcc won't make any of %0, %1, %2 */ \ 379 /* be r11. That should be ensured (cough, cough) */ \ 380 /* by declaring r11 to be clobbered. */ \ 381 __asm__ __volatile__( \ 382 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 383 "movq (%1), %%r11" "\n\t" /*src64*/ \ 384 _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2" "\n\t" \ 385 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 386 : /*out*/ \ 387 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 388 : "cc", "memory", "xmm2", "r11" \ 389 ); \ 390 showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \ 391 } 392 #define DO_imm_mscalar_to_r(_opname, _imm, _src) \ 393 { \ 394 V128 dstv; \ 395 V128 res; \ 396 ULong src64 = (ULong)(_src); \ 397 memcpy(dstv, fives, sizeof(dstv)); \ 398 memcpy(res, zeroes, sizeof(res)); \ 399 __asm__ __volatile__( \ 400 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 401 _opname " $" #_imm ", (%1), %%xmm2" "\n\t" \ 402 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 403 : /*out*/ \ 404 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 405 : "cc", "memory", "xmm2" \ 406 ); \ 407 showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \ 408 } 409 410 #define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix) \ 411 DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix ) \ 412 DO_imm_mscalar_to_r( _opname, _imm, _src ) 413 414 415 416 417 418 void test_BLENDPD ( void ) 419 { 420 V128 src, dst; 421 Int i; 422 for (i = 0; i < 10; i++) { 423 randV128(&src); 424 randV128(&dst); 425 DO_imm_mandr_r("blendpd", 0, src, dst); 426 DO_imm_mandr_r("blendpd", 1, src, dst); 427 DO_imm_mandr_r("blendpd", 2, src, dst); 428 DO_imm_mandr_r("blendpd", 3, src, dst); 429 } 430 } 431 432 void test_BLENDPS ( void ) 433 { 434 V128 src, dst; 435 Int i; 436 for (i = 0; i < 10; i++) { 437 randV128(&src); 438 randV128(&dst); 439 
DO_imm_mandr_r("blendps", 0, src, dst); 440 DO_imm_mandr_r("blendps", 1, src, dst); 441 DO_imm_mandr_r("blendps", 2, src, dst); 442 DO_imm_mandr_r("blendps", 3, src, dst); 443 DO_imm_mandr_r("blendps", 4, src, dst); 444 DO_imm_mandr_r("blendps", 5, src, dst); 445 DO_imm_mandr_r("blendps", 6, src, dst); 446 DO_imm_mandr_r("blendps", 7, src, dst); 447 DO_imm_mandr_r("blendps", 8, src, dst); 448 DO_imm_mandr_r("blendps", 9, src, dst); 449 DO_imm_mandr_r("blendps", 10, src, dst); 450 DO_imm_mandr_r("blendps", 11, src, dst); 451 DO_imm_mandr_r("blendps", 12, src, dst); 452 DO_imm_mandr_r("blendps", 13, src, dst); 453 DO_imm_mandr_r("blendps", 14, src, dst); 454 DO_imm_mandr_r("blendps", 15, src, dst); 455 } 456 } 457 458 void test_DPPD ( void ) 459 { 460 V128 src, dst; 461 { 462 *(double*)(&src[0]) = 1.2345; 463 *(double*)(&src[8]) = -6.78910; 464 *(double*)(&dst[0]) = -11.121314; 465 *(double*)(&dst[8]) = 15.161718; 466 DO_imm_mandr_r("dppd", 0, src, dst); 467 DO_imm_mandr_r("dppd", 1, src, dst); 468 DO_imm_mandr_r("dppd", 2, src, dst); 469 DO_imm_mandr_r("dppd", 3, src, dst); 470 DO_imm_mandr_r("dppd", 4, src, dst); 471 DO_imm_mandr_r("dppd", 5, src, dst); 472 DO_imm_mandr_r("dppd", 6, src, dst); 473 DO_imm_mandr_r("dppd", 7, src, dst); 474 DO_imm_mandr_r("dppd", 8, src, dst); 475 DO_imm_mandr_r("dppd", 9, src, dst); 476 DO_imm_mandr_r("dppd", 10, src, dst); 477 DO_imm_mandr_r("dppd", 11, src, dst); 478 DO_imm_mandr_r("dppd", 12, src, dst); 479 DO_imm_mandr_r("dppd", 13, src, dst); 480 DO_imm_mandr_r("dppd", 14, src, dst); 481 DO_imm_mandr_r("dppd", 15, src, dst); 482 DO_imm_mandr_r("dppd", 16, src, dst); 483 DO_imm_mandr_r("dppd", 17, src, dst); 484 DO_imm_mandr_r("dppd", 18, src, dst); 485 DO_imm_mandr_r("dppd", 19, src, dst); 486 DO_imm_mandr_r("dppd", 20, src, dst); 487 DO_imm_mandr_r("dppd", 21, src, dst); 488 DO_imm_mandr_r("dppd", 22, src, dst); 489 DO_imm_mandr_r("dppd", 23, src, dst); 490 DO_imm_mandr_r("dppd", 24, src, dst); 491 DO_imm_mandr_r("dppd", 25, 
src, dst); 492 DO_imm_mandr_r("dppd", 26, src, dst); 493 DO_imm_mandr_r("dppd", 27, src, dst); 494 DO_imm_mandr_r("dppd", 28, src, dst); 495 DO_imm_mandr_r("dppd", 29, src, dst); 496 DO_imm_mandr_r("dppd", 30, src, dst); 497 DO_imm_mandr_r("dppd", 31, src, dst); 498 DO_imm_mandr_r("dppd", 32, src, dst); 499 DO_imm_mandr_r("dppd", 33, src, dst); 500 DO_imm_mandr_r("dppd", 34, src, dst); 501 DO_imm_mandr_r("dppd", 35, src, dst); 502 DO_imm_mandr_r("dppd", 36, src, dst); 503 DO_imm_mandr_r("dppd", 37, src, dst); 504 DO_imm_mandr_r("dppd", 38, src, dst); 505 DO_imm_mandr_r("dppd", 39, src, dst); 506 DO_imm_mandr_r("dppd", 40, src, dst); 507 DO_imm_mandr_r("dppd", 41, src, dst); 508 DO_imm_mandr_r("dppd", 42, src, dst); 509 DO_imm_mandr_r("dppd", 43, src, dst); 510 DO_imm_mandr_r("dppd", 44, src, dst); 511 DO_imm_mandr_r("dppd", 45, src, dst); 512 DO_imm_mandr_r("dppd", 46, src, dst); 513 DO_imm_mandr_r("dppd", 47, src, dst); 514 DO_imm_mandr_r("dppd", 48, src, dst); 515 DO_imm_mandr_r("dppd", 49, src, dst); 516 DO_imm_mandr_r("dppd", 50, src, dst); 517 DO_imm_mandr_r("dppd", 51, src, dst); 518 DO_imm_mandr_r("dppd", 52, src, dst); 519 DO_imm_mandr_r("dppd", 53, src, dst); 520 DO_imm_mandr_r("dppd", 54, src, dst); 521 DO_imm_mandr_r("dppd", 55, src, dst); 522 DO_imm_mandr_r("dppd", 56, src, dst); 523 DO_imm_mandr_r("dppd", 57, src, dst); 524 DO_imm_mandr_r("dppd", 58, src, dst); 525 DO_imm_mandr_r("dppd", 59, src, dst); 526 DO_imm_mandr_r("dppd", 60, src, dst); 527 DO_imm_mandr_r("dppd", 61, src, dst); 528 DO_imm_mandr_r("dppd", 62, src, dst); 529 DO_imm_mandr_r("dppd", 63, src, dst); 530 DO_imm_mandr_r("dppd", 64, src, dst); 531 DO_imm_mandr_r("dppd", 65, src, dst); 532 DO_imm_mandr_r("dppd", 66, src, dst); 533 DO_imm_mandr_r("dppd", 67, src, dst); 534 DO_imm_mandr_r("dppd", 68, src, dst); 535 DO_imm_mandr_r("dppd", 69, src, dst); 536 DO_imm_mandr_r("dppd", 70, src, dst); 537 DO_imm_mandr_r("dppd", 71, src, dst); 538 DO_imm_mandr_r("dppd", 72, src, dst); 539 
DO_imm_mandr_r("dppd", 73, src, dst); 540 DO_imm_mandr_r("dppd", 74, src, dst); 541 DO_imm_mandr_r("dppd", 75, src, dst); 542 DO_imm_mandr_r("dppd", 76, src, dst); 543 DO_imm_mandr_r("dppd", 77, src, dst); 544 DO_imm_mandr_r("dppd", 78, src, dst); 545 DO_imm_mandr_r("dppd", 79, src, dst); 546 DO_imm_mandr_r("dppd", 80, src, dst); 547 DO_imm_mandr_r("dppd", 81, src, dst); 548 DO_imm_mandr_r("dppd", 82, src, dst); 549 DO_imm_mandr_r("dppd", 83, src, dst); 550 DO_imm_mandr_r("dppd", 84, src, dst); 551 DO_imm_mandr_r("dppd", 85, src, dst); 552 DO_imm_mandr_r("dppd", 86, src, dst); 553 DO_imm_mandr_r("dppd", 87, src, dst); 554 DO_imm_mandr_r("dppd", 88, src, dst); 555 DO_imm_mandr_r("dppd", 89, src, dst); 556 DO_imm_mandr_r("dppd", 90, src, dst); 557 DO_imm_mandr_r("dppd", 91, src, dst); 558 DO_imm_mandr_r("dppd", 92, src, dst); 559 DO_imm_mandr_r("dppd", 93, src, dst); 560 DO_imm_mandr_r("dppd", 94, src, dst); 561 DO_imm_mandr_r("dppd", 95, src, dst); 562 DO_imm_mandr_r("dppd", 96, src, dst); 563 DO_imm_mandr_r("dppd", 97, src, dst); 564 DO_imm_mandr_r("dppd", 98, src, dst); 565 DO_imm_mandr_r("dppd", 99, src, dst); 566 DO_imm_mandr_r("dppd", 100, src, dst); 567 DO_imm_mandr_r("dppd", 101, src, dst); 568 DO_imm_mandr_r("dppd", 102, src, dst); 569 DO_imm_mandr_r("dppd", 103, src, dst); 570 DO_imm_mandr_r("dppd", 104, src, dst); 571 DO_imm_mandr_r("dppd", 105, src, dst); 572 DO_imm_mandr_r("dppd", 106, src, dst); 573 DO_imm_mandr_r("dppd", 107, src, dst); 574 DO_imm_mandr_r("dppd", 108, src, dst); 575 DO_imm_mandr_r("dppd", 109, src, dst); 576 DO_imm_mandr_r("dppd", 110, src, dst); 577 DO_imm_mandr_r("dppd", 111, src, dst); 578 DO_imm_mandr_r("dppd", 112, src, dst); 579 DO_imm_mandr_r("dppd", 113, src, dst); 580 DO_imm_mandr_r("dppd", 114, src, dst); 581 DO_imm_mandr_r("dppd", 115, src, dst); 582 DO_imm_mandr_r("dppd", 116, src, dst); 583 DO_imm_mandr_r("dppd", 117, src, dst); 584 DO_imm_mandr_r("dppd", 118, src, dst); 585 DO_imm_mandr_r("dppd", 119, src, dst); 586 
DO_imm_mandr_r("dppd", 120, src, dst); 587 DO_imm_mandr_r("dppd", 121, src, dst); 588 DO_imm_mandr_r("dppd", 122, src, dst); 589 DO_imm_mandr_r("dppd", 123, src, dst); 590 DO_imm_mandr_r("dppd", 124, src, dst); 591 DO_imm_mandr_r("dppd", 125, src, dst); 592 DO_imm_mandr_r("dppd", 126, src, dst); 593 DO_imm_mandr_r("dppd", 127, src, dst); 594 DO_imm_mandr_r("dppd", 128, src, dst); 595 DO_imm_mandr_r("dppd", 129, src, dst); 596 DO_imm_mandr_r("dppd", 130, src, dst); 597 DO_imm_mandr_r("dppd", 131, src, dst); 598 DO_imm_mandr_r("dppd", 132, src, dst); 599 DO_imm_mandr_r("dppd", 133, src, dst); 600 DO_imm_mandr_r("dppd", 134, src, dst); 601 DO_imm_mandr_r("dppd", 135, src, dst); 602 DO_imm_mandr_r("dppd", 136, src, dst); 603 DO_imm_mandr_r("dppd", 137, src, dst); 604 DO_imm_mandr_r("dppd", 138, src, dst); 605 DO_imm_mandr_r("dppd", 139, src, dst); 606 DO_imm_mandr_r("dppd", 140, src, dst); 607 DO_imm_mandr_r("dppd", 141, src, dst); 608 DO_imm_mandr_r("dppd", 142, src, dst); 609 DO_imm_mandr_r("dppd", 143, src, dst); 610 DO_imm_mandr_r("dppd", 144, src, dst); 611 DO_imm_mandr_r("dppd", 145, src, dst); 612 DO_imm_mandr_r("dppd", 146, src, dst); 613 DO_imm_mandr_r("dppd", 147, src, dst); 614 DO_imm_mandr_r("dppd", 148, src, dst); 615 DO_imm_mandr_r("dppd", 149, src, dst); 616 DO_imm_mandr_r("dppd", 150, src, dst); 617 DO_imm_mandr_r("dppd", 151, src, dst); 618 DO_imm_mandr_r("dppd", 152, src, dst); 619 DO_imm_mandr_r("dppd", 153, src, dst); 620 DO_imm_mandr_r("dppd", 154, src, dst); 621 DO_imm_mandr_r("dppd", 155, src, dst); 622 DO_imm_mandr_r("dppd", 156, src, dst); 623 DO_imm_mandr_r("dppd", 157, src, dst); 624 DO_imm_mandr_r("dppd", 158, src, dst); 625 DO_imm_mandr_r("dppd", 159, src, dst); 626 DO_imm_mandr_r("dppd", 160, src, dst); 627 DO_imm_mandr_r("dppd", 161, src, dst); 628 DO_imm_mandr_r("dppd", 162, src, dst); 629 DO_imm_mandr_r("dppd", 163, src, dst); 630 DO_imm_mandr_r("dppd", 164, src, dst); 631 DO_imm_mandr_r("dppd", 165, src, dst); 632 
DO_imm_mandr_r("dppd", 166, src, dst); 633 DO_imm_mandr_r("dppd", 167, src, dst); 634 DO_imm_mandr_r("dppd", 168, src, dst); 635 DO_imm_mandr_r("dppd", 169, src, dst); 636 DO_imm_mandr_r("dppd", 170, src, dst); 637 DO_imm_mandr_r("dppd", 171, src, dst); 638 DO_imm_mandr_r("dppd", 172, src, dst); 639 DO_imm_mandr_r("dppd", 173, src, dst); 640 DO_imm_mandr_r("dppd", 174, src, dst); 641 DO_imm_mandr_r("dppd", 175, src, dst); 642 DO_imm_mandr_r("dppd", 176, src, dst); 643 DO_imm_mandr_r("dppd", 177, src, dst); 644 DO_imm_mandr_r("dppd", 178, src, dst); 645 DO_imm_mandr_r("dppd", 179, src, dst); 646 DO_imm_mandr_r("dppd", 180, src, dst); 647 DO_imm_mandr_r("dppd", 181, src, dst); 648 DO_imm_mandr_r("dppd", 182, src, dst); 649 DO_imm_mandr_r("dppd", 183, src, dst); 650 DO_imm_mandr_r("dppd", 184, src, dst); 651 DO_imm_mandr_r("dppd", 185, src, dst); 652 DO_imm_mandr_r("dppd", 186, src, dst); 653 DO_imm_mandr_r("dppd", 187, src, dst); 654 DO_imm_mandr_r("dppd", 188, src, dst); 655 DO_imm_mandr_r("dppd", 189, src, dst); 656 DO_imm_mandr_r("dppd", 190, src, dst); 657 DO_imm_mandr_r("dppd", 191, src, dst); 658 DO_imm_mandr_r("dppd", 192, src, dst); 659 DO_imm_mandr_r("dppd", 193, src, dst); 660 DO_imm_mandr_r("dppd", 194, src, dst); 661 DO_imm_mandr_r("dppd", 195, src, dst); 662 DO_imm_mandr_r("dppd", 196, src, dst); 663 DO_imm_mandr_r("dppd", 197, src, dst); 664 DO_imm_mandr_r("dppd", 198, src, dst); 665 DO_imm_mandr_r("dppd", 199, src, dst); 666 DO_imm_mandr_r("dppd", 200, src, dst); 667 DO_imm_mandr_r("dppd", 201, src, dst); 668 DO_imm_mandr_r("dppd", 202, src, dst); 669 DO_imm_mandr_r("dppd", 203, src, dst); 670 DO_imm_mandr_r("dppd", 204, src, dst); 671 DO_imm_mandr_r("dppd", 205, src, dst); 672 DO_imm_mandr_r("dppd", 206, src, dst); 673 DO_imm_mandr_r("dppd", 207, src, dst); 674 DO_imm_mandr_r("dppd", 208, src, dst); 675 DO_imm_mandr_r("dppd", 209, src, dst); 676 DO_imm_mandr_r("dppd", 210, src, dst); 677 DO_imm_mandr_r("dppd", 211, src, dst); 678 
DO_imm_mandr_r("dppd", 212, src, dst); 679 DO_imm_mandr_r("dppd", 213, src, dst); 680 DO_imm_mandr_r("dppd", 214, src, dst); 681 DO_imm_mandr_r("dppd", 215, src, dst); 682 DO_imm_mandr_r("dppd", 216, src, dst); 683 DO_imm_mandr_r("dppd", 217, src, dst); 684 DO_imm_mandr_r("dppd", 218, src, dst); 685 DO_imm_mandr_r("dppd", 219, src, dst); 686 DO_imm_mandr_r("dppd", 220, src, dst); 687 DO_imm_mandr_r("dppd", 221, src, dst); 688 DO_imm_mandr_r("dppd", 222, src, dst); 689 DO_imm_mandr_r("dppd", 223, src, dst); 690 DO_imm_mandr_r("dppd", 224, src, dst); 691 DO_imm_mandr_r("dppd", 225, src, dst); 692 DO_imm_mandr_r("dppd", 226, src, dst); 693 DO_imm_mandr_r("dppd", 227, src, dst); 694 DO_imm_mandr_r("dppd", 228, src, dst); 695 DO_imm_mandr_r("dppd", 229, src, dst); 696 DO_imm_mandr_r("dppd", 230, src, dst); 697 DO_imm_mandr_r("dppd", 231, src, dst); 698 DO_imm_mandr_r("dppd", 232, src, dst); 699 DO_imm_mandr_r("dppd", 233, src, dst); 700 DO_imm_mandr_r("dppd", 234, src, dst); 701 DO_imm_mandr_r("dppd", 235, src, dst); 702 DO_imm_mandr_r("dppd", 236, src, dst); 703 DO_imm_mandr_r("dppd", 237, src, dst); 704 DO_imm_mandr_r("dppd", 238, src, dst); 705 DO_imm_mandr_r("dppd", 239, src, dst); 706 DO_imm_mandr_r("dppd", 240, src, dst); 707 DO_imm_mandr_r("dppd", 241, src, dst); 708 DO_imm_mandr_r("dppd", 242, src, dst); 709 DO_imm_mandr_r("dppd", 243, src, dst); 710 DO_imm_mandr_r("dppd", 244, src, dst); 711 DO_imm_mandr_r("dppd", 245, src, dst); 712 DO_imm_mandr_r("dppd", 246, src, dst); 713 DO_imm_mandr_r("dppd", 247, src, dst); 714 DO_imm_mandr_r("dppd", 248, src, dst); 715 DO_imm_mandr_r("dppd", 249, src, dst); 716 DO_imm_mandr_r("dppd", 250, src, dst); 717 DO_imm_mandr_r("dppd", 251, src, dst); 718 DO_imm_mandr_r("dppd", 252, src, dst); 719 DO_imm_mandr_r("dppd", 253, src, dst); 720 DO_imm_mandr_r("dppd", 254, src, dst); 721 DO_imm_mandr_r("dppd", 255, src, dst); 722 } 723 } 724 725 void test_DPPS ( void ) 726 { 727 V128 src, dst; 728 { 729 *(float*)(&src[0]) = 1.2; 730 
*(float*)(&src[4]) = -3.4; 731 *(float*)(&src[8]) = -6.7; 732 *(float*)(&src[12]) = 8.9; 733 *(float*)(&dst[0]) = -10.11; 734 *(float*)(&dst[4]) = 12.13; 735 *(float*)(&dst[8]) = 14.15; 736 *(float*)(&dst[12]) = -16.17; 737 DO_imm_mandr_r("dpps", 0, src, dst); 738 DO_imm_mandr_r("dpps", 1, src, dst); 739 DO_imm_mandr_r("dpps", 2, src, dst); 740 DO_imm_mandr_r("dpps", 3, src, dst); 741 DO_imm_mandr_r("dpps", 4, src, dst); 742 DO_imm_mandr_r("dpps", 5, src, dst); 743 DO_imm_mandr_r("dpps", 6, src, dst); 744 DO_imm_mandr_r("dpps", 7, src, dst); 745 DO_imm_mandr_r("dpps", 8, src, dst); 746 DO_imm_mandr_r("dpps", 9, src, dst); 747 DO_imm_mandr_r("dpps", 10, src, dst); 748 DO_imm_mandr_r("dpps", 11, src, dst); 749 DO_imm_mandr_r("dpps", 12, src, dst); 750 DO_imm_mandr_r("dpps", 13, src, dst); 751 DO_imm_mandr_r("dpps", 14, src, dst); 752 DO_imm_mandr_r("dpps", 15, src, dst); 753 DO_imm_mandr_r("dpps", 16, src, dst); 754 DO_imm_mandr_r("dpps", 17, src, dst); 755 DO_imm_mandr_r("dpps", 18, src, dst); 756 DO_imm_mandr_r("dpps", 19, src, dst); 757 DO_imm_mandr_r("dpps", 20, src, dst); 758 DO_imm_mandr_r("dpps", 21, src, dst); 759 DO_imm_mandr_r("dpps", 22, src, dst); 760 DO_imm_mandr_r("dpps", 23, src, dst); 761 DO_imm_mandr_r("dpps", 24, src, dst); 762 DO_imm_mandr_r("dpps", 25, src, dst); 763 DO_imm_mandr_r("dpps", 26, src, dst); 764 DO_imm_mandr_r("dpps", 27, src, dst); 765 DO_imm_mandr_r("dpps", 28, src, dst); 766 DO_imm_mandr_r("dpps", 29, src, dst); 767 DO_imm_mandr_r("dpps", 30, src, dst); 768 DO_imm_mandr_r("dpps", 31, src, dst); 769 DO_imm_mandr_r("dpps", 32, src, dst); 770 DO_imm_mandr_r("dpps", 33, src, dst); 771 DO_imm_mandr_r("dpps", 34, src, dst); 772 DO_imm_mandr_r("dpps", 35, src, dst); 773 DO_imm_mandr_r("dpps", 36, src, dst); 774 DO_imm_mandr_r("dpps", 37, src, dst); 775 DO_imm_mandr_r("dpps", 38, src, dst); 776 DO_imm_mandr_r("dpps", 39, src, dst); 777 DO_imm_mandr_r("dpps", 40, src, dst); 778 DO_imm_mandr_r("dpps", 41, src, dst); 779 
DO_imm_mandr_r("dpps", 42, src, dst); 780 DO_imm_mandr_r("dpps", 43, src, dst); 781 DO_imm_mandr_r("dpps", 44, src, dst); 782 DO_imm_mandr_r("dpps", 45, src, dst); 783 DO_imm_mandr_r("dpps", 46, src, dst); 784 DO_imm_mandr_r("dpps", 47, src, dst); 785 DO_imm_mandr_r("dpps", 48, src, dst); 786 DO_imm_mandr_r("dpps", 49, src, dst); 787 DO_imm_mandr_r("dpps", 50, src, dst); 788 DO_imm_mandr_r("dpps", 51, src, dst); 789 DO_imm_mandr_r("dpps", 52, src, dst); 790 DO_imm_mandr_r("dpps", 53, src, dst); 791 DO_imm_mandr_r("dpps", 54, src, dst); 792 DO_imm_mandr_r("dpps", 55, src, dst); 793 DO_imm_mandr_r("dpps", 56, src, dst); 794 DO_imm_mandr_r("dpps", 57, src, dst); 795 DO_imm_mandr_r("dpps", 58, src, dst); 796 DO_imm_mandr_r("dpps", 59, src, dst); 797 DO_imm_mandr_r("dpps", 60, src, dst); 798 DO_imm_mandr_r("dpps", 61, src, dst); 799 DO_imm_mandr_r("dpps", 62, src, dst); 800 DO_imm_mandr_r("dpps", 63, src, dst); 801 DO_imm_mandr_r("dpps", 64, src, dst); 802 DO_imm_mandr_r("dpps", 65, src, dst); 803 DO_imm_mandr_r("dpps", 66, src, dst); 804 DO_imm_mandr_r("dpps", 67, src, dst); 805 DO_imm_mandr_r("dpps", 68, src, dst); 806 DO_imm_mandr_r("dpps", 69, src, dst); 807 DO_imm_mandr_r("dpps", 70, src, dst); 808 DO_imm_mandr_r("dpps", 71, src, dst); 809 DO_imm_mandr_r("dpps", 72, src, dst); 810 DO_imm_mandr_r("dpps", 73, src, dst); 811 DO_imm_mandr_r("dpps", 74, src, dst); 812 DO_imm_mandr_r("dpps", 75, src, dst); 813 DO_imm_mandr_r("dpps", 76, src, dst); 814 DO_imm_mandr_r("dpps", 77, src, dst); 815 DO_imm_mandr_r("dpps", 78, src, dst); 816 DO_imm_mandr_r("dpps", 79, src, dst); 817 DO_imm_mandr_r("dpps", 80, src, dst); 818 DO_imm_mandr_r("dpps", 81, src, dst); 819 DO_imm_mandr_r("dpps", 82, src, dst); 820 DO_imm_mandr_r("dpps", 83, src, dst); 821 DO_imm_mandr_r("dpps", 84, src, dst); 822 DO_imm_mandr_r("dpps", 85, src, dst); 823 DO_imm_mandr_r("dpps", 86, src, dst); 824 DO_imm_mandr_r("dpps", 87, src, dst); 825 DO_imm_mandr_r("dpps", 88, src, dst); 826 DO_imm_mandr_r("dpps", 
89, src, dst); 827 DO_imm_mandr_r("dpps", 90, src, dst); 828 DO_imm_mandr_r("dpps", 91, src, dst); 829 DO_imm_mandr_r("dpps", 92, src, dst); 830 DO_imm_mandr_r("dpps", 93, src, dst); 831 DO_imm_mandr_r("dpps", 94, src, dst); 832 DO_imm_mandr_r("dpps", 95, src, dst); 833 DO_imm_mandr_r("dpps", 96, src, dst); 834 DO_imm_mandr_r("dpps", 97, src, dst); 835 DO_imm_mandr_r("dpps", 98, src, dst); 836 DO_imm_mandr_r("dpps", 99, src, dst); 837 DO_imm_mandr_r("dpps", 100, src, dst); 838 DO_imm_mandr_r("dpps", 101, src, dst); 839 DO_imm_mandr_r("dpps", 102, src, dst); 840 DO_imm_mandr_r("dpps", 103, src, dst); 841 DO_imm_mandr_r("dpps", 104, src, dst); 842 DO_imm_mandr_r("dpps", 105, src, dst); 843 DO_imm_mandr_r("dpps", 106, src, dst); 844 DO_imm_mandr_r("dpps", 107, src, dst); 845 DO_imm_mandr_r("dpps", 108, src, dst); 846 DO_imm_mandr_r("dpps", 109, src, dst); 847 DO_imm_mandr_r("dpps", 110, src, dst); 848 DO_imm_mandr_r("dpps", 111, src, dst); 849 DO_imm_mandr_r("dpps", 112, src, dst); 850 DO_imm_mandr_r("dpps", 113, src, dst); 851 DO_imm_mandr_r("dpps", 114, src, dst); 852 DO_imm_mandr_r("dpps", 115, src, dst); 853 DO_imm_mandr_r("dpps", 116, src, dst); 854 DO_imm_mandr_r("dpps", 117, src, dst); 855 DO_imm_mandr_r("dpps", 118, src, dst); 856 DO_imm_mandr_r("dpps", 119, src, dst); 857 DO_imm_mandr_r("dpps", 120, src, dst); 858 DO_imm_mandr_r("dpps", 121, src, dst); 859 DO_imm_mandr_r("dpps", 122, src, dst); 860 DO_imm_mandr_r("dpps", 123, src, dst); 861 DO_imm_mandr_r("dpps", 124, src, dst); 862 DO_imm_mandr_r("dpps", 125, src, dst); 863 DO_imm_mandr_r("dpps", 126, src, dst); 864 DO_imm_mandr_r("dpps", 127, src, dst); 865 DO_imm_mandr_r("dpps", 128, src, dst); 866 DO_imm_mandr_r("dpps", 129, src, dst); 867 DO_imm_mandr_r("dpps", 130, src, dst); 868 DO_imm_mandr_r("dpps", 131, src, dst); 869 DO_imm_mandr_r("dpps", 132, src, dst); 870 DO_imm_mandr_r("dpps", 133, src, dst); 871 DO_imm_mandr_r("dpps", 134, src, dst); 872 DO_imm_mandr_r("dpps", 135, src, dst); 873 
DO_imm_mandr_r("dpps", 136, src, dst); 874 DO_imm_mandr_r("dpps", 137, src, dst); 875 DO_imm_mandr_r("dpps", 138, src, dst); 876 DO_imm_mandr_r("dpps", 139, src, dst); 877 DO_imm_mandr_r("dpps", 140, src, dst); 878 DO_imm_mandr_r("dpps", 141, src, dst); 879 DO_imm_mandr_r("dpps", 142, src, dst); 880 DO_imm_mandr_r("dpps", 143, src, dst); 881 DO_imm_mandr_r("dpps", 144, src, dst); 882 DO_imm_mandr_r("dpps", 145, src, dst); 883 DO_imm_mandr_r("dpps", 146, src, dst); 884 DO_imm_mandr_r("dpps", 147, src, dst); 885 DO_imm_mandr_r("dpps", 148, src, dst); 886 DO_imm_mandr_r("dpps", 149, src, dst); 887 DO_imm_mandr_r("dpps", 150, src, dst); 888 DO_imm_mandr_r("dpps", 151, src, dst); 889 DO_imm_mandr_r("dpps", 152, src, dst); 890 DO_imm_mandr_r("dpps", 153, src, dst); 891 DO_imm_mandr_r("dpps", 154, src, dst); 892 DO_imm_mandr_r("dpps", 155, src, dst); 893 DO_imm_mandr_r("dpps", 156, src, dst); 894 DO_imm_mandr_r("dpps", 157, src, dst); 895 DO_imm_mandr_r("dpps", 158, src, dst); 896 DO_imm_mandr_r("dpps", 159, src, dst); 897 DO_imm_mandr_r("dpps", 160, src, dst); 898 DO_imm_mandr_r("dpps", 161, src, dst); 899 DO_imm_mandr_r("dpps", 162, src, dst); 900 DO_imm_mandr_r("dpps", 163, src, dst); 901 DO_imm_mandr_r("dpps", 164, src, dst); 902 DO_imm_mandr_r("dpps", 165, src, dst); 903 DO_imm_mandr_r("dpps", 166, src, dst); 904 DO_imm_mandr_r("dpps", 167, src, dst); 905 DO_imm_mandr_r("dpps", 168, src, dst); 906 DO_imm_mandr_r("dpps", 169, src, dst); 907 DO_imm_mandr_r("dpps", 170, src, dst); 908 DO_imm_mandr_r("dpps", 171, src, dst); 909 DO_imm_mandr_r("dpps", 172, src, dst); 910 DO_imm_mandr_r("dpps", 173, src, dst); 911 DO_imm_mandr_r("dpps", 174, src, dst); 912 DO_imm_mandr_r("dpps", 175, src, dst); 913 DO_imm_mandr_r("dpps", 176, src, dst); 914 DO_imm_mandr_r("dpps", 177, src, dst); 915 DO_imm_mandr_r("dpps", 178, src, dst); 916 DO_imm_mandr_r("dpps", 179, src, dst); 917 DO_imm_mandr_r("dpps", 180, src, dst); 918 DO_imm_mandr_r("dpps", 181, src, dst); 919 
DO_imm_mandr_r("dpps", 182, src, dst); 920 DO_imm_mandr_r("dpps", 183, src, dst); 921 DO_imm_mandr_r("dpps", 184, src, dst); 922 DO_imm_mandr_r("dpps", 185, src, dst); 923 DO_imm_mandr_r("dpps", 186, src, dst); 924 DO_imm_mandr_r("dpps", 187, src, dst); 925 DO_imm_mandr_r("dpps", 188, src, dst); 926 DO_imm_mandr_r("dpps", 189, src, dst); 927 DO_imm_mandr_r("dpps", 190, src, dst); 928 DO_imm_mandr_r("dpps", 191, src, dst); 929 DO_imm_mandr_r("dpps", 192, src, dst); 930 DO_imm_mandr_r("dpps", 193, src, dst); 931 DO_imm_mandr_r("dpps", 194, src, dst); 932 DO_imm_mandr_r("dpps", 195, src, dst); 933 DO_imm_mandr_r("dpps", 196, src, dst); 934 DO_imm_mandr_r("dpps", 197, src, dst); 935 DO_imm_mandr_r("dpps", 198, src, dst); 936 DO_imm_mandr_r("dpps", 199, src, dst); 937 DO_imm_mandr_r("dpps", 200, src, dst); 938 DO_imm_mandr_r("dpps", 201, src, dst); 939 DO_imm_mandr_r("dpps", 202, src, dst); 940 DO_imm_mandr_r("dpps", 203, src, dst); 941 DO_imm_mandr_r("dpps", 204, src, dst); 942 DO_imm_mandr_r("dpps", 205, src, dst); 943 DO_imm_mandr_r("dpps", 206, src, dst); 944 DO_imm_mandr_r("dpps", 207, src, dst); 945 DO_imm_mandr_r("dpps", 208, src, dst); 946 DO_imm_mandr_r("dpps", 209, src, dst); 947 DO_imm_mandr_r("dpps", 210, src, dst); 948 DO_imm_mandr_r("dpps", 211, src, dst); 949 DO_imm_mandr_r("dpps", 212, src, dst); 950 DO_imm_mandr_r("dpps", 213, src, dst); 951 DO_imm_mandr_r("dpps", 214, src, dst); 952 DO_imm_mandr_r("dpps", 215, src, dst); 953 DO_imm_mandr_r("dpps", 216, src, dst); 954 DO_imm_mandr_r("dpps", 217, src, dst); 955 DO_imm_mandr_r("dpps", 218, src, dst); 956 DO_imm_mandr_r("dpps", 219, src, dst); 957 DO_imm_mandr_r("dpps", 220, src, dst); 958 DO_imm_mandr_r("dpps", 221, src, dst); 959 DO_imm_mandr_r("dpps", 222, src, dst); 960 DO_imm_mandr_r("dpps", 223, src, dst); 961 DO_imm_mandr_r("dpps", 224, src, dst); 962 DO_imm_mandr_r("dpps", 225, src, dst); 963 DO_imm_mandr_r("dpps", 226, src, dst); 964 DO_imm_mandr_r("dpps", 227, src, dst); 965 
/* dpps sweep continues: immediates 228..255 on the same src/dst pair. */
      DO_imm_mandr_r("dpps", 228, src, dst);
      DO_imm_mandr_r("dpps", 229, src, dst);
      DO_imm_mandr_r("dpps", 230, src, dst);
      DO_imm_mandr_r("dpps", 231, src, dst);
      DO_imm_mandr_r("dpps", 232, src, dst);
      DO_imm_mandr_r("dpps", 233, src, dst);
      DO_imm_mandr_r("dpps", 234, src, dst);
      DO_imm_mandr_r("dpps", 235, src, dst);
      DO_imm_mandr_r("dpps", 236, src, dst);
      DO_imm_mandr_r("dpps", 237, src, dst);
      DO_imm_mandr_r("dpps", 238, src, dst);
      DO_imm_mandr_r("dpps", 239, src, dst);
      DO_imm_mandr_r("dpps", 240, src, dst);
      DO_imm_mandr_r("dpps", 241, src, dst);
      DO_imm_mandr_r("dpps", 242, src, dst);
      DO_imm_mandr_r("dpps", 243, src, dst);
      DO_imm_mandr_r("dpps", 244, src, dst);
      DO_imm_mandr_r("dpps", 245, src, dst);
      DO_imm_mandr_r("dpps", 246, src, dst);
      DO_imm_mandr_r("dpps", 247, src, dst);
      DO_imm_mandr_r("dpps", 248, src, dst);
      DO_imm_mandr_r("dpps", 249, src, dst);
      DO_imm_mandr_r("dpps", 250, src, dst);
      DO_imm_mandr_r("dpps", 251, src, dst);
      DO_imm_mandr_r("dpps", 252, src, dst);
      DO_imm_mandr_r("dpps", 253, src, dst);
      DO_imm_mandr_r("dpps", 254, src, dst);
      DO_imm_mandr_r("dpps", 255, src, dst);
   }
}

/* insertps: one DO_imm_mandr_r invocation for every immediate 0..255,
   all on the same fixed pair of four-float vectors.

   The operands are filled with memcpy rather than by storing through a
   float* cast into the unsigned char array: the cast form breaks the
   strict-aliasing rules, which this file already avoids the same way in
   do64HLtoV128. */
void test_INSERTPS ( void )
{
   V128 src, dst;
   {
      /* Lane values, lowest-addressed lane first.  The constants carry
         no 'f' suffix on purpose, so each one is converted from double
         to float exactly as the assignments they replace did. */
      const float srcLanes[4] = {   1.2,  -3.4,  -6.7,    8.9  };
      const float dstLanes[4] = { -10.11, 12.13, 14.15, -16.17 };
      assert(sizeof(srcLanes) == sizeof(src));
      assert(sizeof(dstLanes) == sizeof(dst));
      memcpy(&src[0], srcLanes, sizeof(src));
      memcpy(&dst[0], dstLanes, sizeof(dst));
      DO_imm_mandr_r("insertps", 0, src, dst);
      DO_imm_mandr_r("insertps", 1, src, dst);
      DO_imm_mandr_r("insertps", 2, src, dst);
      DO_imm_mandr_r("insertps", 3, src, dst);
      DO_imm_mandr_r("insertps", 4, src, dst);
      DO_imm_mandr_r("insertps", 5, src, dst);
      DO_imm_mandr_r("insertps", 6, src, dst);
      DO_imm_mandr_r("insertps", 7, src, dst);
      DO_imm_mandr_r("insertps", 8, src, dst);
      DO_imm_mandr_r("insertps", 9, src, dst);
      DO_imm_mandr_r("insertps", 10, src, dst);
      DO_imm_mandr_r("insertps", 11, src, dst);
      DO_imm_mandr_r("insertps", 12, src, dst);
      DO_imm_mandr_r("insertps", 13, src, dst);
      DO_imm_mandr_r("insertps", 14, src, dst);
      DO_imm_mandr_r("insertps", 15, src, dst);
      DO_imm_mandr_r("insertps", 16, src, dst);
      DO_imm_mandr_r("insertps", 17, src, dst);
      DO_imm_mandr_r("insertps", 18, src, dst);
      DO_imm_mandr_r("insertps", 19, src, dst);
      DO_imm_mandr_r("insertps", 20, src, dst);
      DO_imm_mandr_r("insertps", 21, src, dst);
      DO_imm_mandr_r("insertps", 22, src, dst);
      DO_imm_mandr_r("insertps", 23, src, dst);
      DO_imm_mandr_r("insertps", 24, src, dst);
      DO_imm_mandr_r("insertps", 25, src, dst);
      DO_imm_mandr_r("insertps", 26, src, dst);
      DO_imm_mandr_r("insertps", 27, src, dst);
      DO_imm_mandr_r("insertps", 28, src, dst);
      DO_imm_mandr_r("insertps", 29, src, dst);
      DO_imm_mandr_r("insertps", 30, src, dst);
      DO_imm_mandr_r("insertps", 31, src, dst);
      DO_imm_mandr_r("insertps", 32, src, dst);
      DO_imm_mandr_r("insertps", 33, src, dst);
      DO_imm_mandr_r("insertps", 34, src, dst);
      DO_imm_mandr_r("insertps", 35, src, dst);
      DO_imm_mandr_r("insertps", 36, src, dst);
      DO_imm_mandr_r("insertps", 37, src, dst);
      DO_imm_mandr_r("insertps", 38, src, dst);
      DO_imm_mandr_r("insertps", 39, src, dst);
      DO_imm_mandr_r("insertps", 40, src, dst);
      DO_imm_mandr_r("insertps", 41, src, dst);
      DO_imm_mandr_r("insertps", 42, src, dst);
      DO_imm_mandr_r("insertps", 43, src, dst);
      DO_imm_mandr_r("insertps", 44, src, dst);
      DO_imm_mandr_r("insertps", 45, src, dst);
      DO_imm_mandr_r("insertps", 46, src, dst);
      DO_imm_mandr_r("insertps", 47, src, dst);
      DO_imm_mandr_r("insertps", 48, src, dst);
      DO_imm_mandr_r("insertps", 49, src, dst);
      DO_imm_mandr_r("insertps", 50, src, dst);
      DO_imm_mandr_r("insertps", 51, src, dst);
      DO_imm_mandr_r("insertps", 52, src, dst);
      DO_imm_mandr_r("insertps", 53, src, dst);
      DO_imm_mandr_r("insertps", 54, src, dst);
      DO_imm_mandr_r("insertps", 55, src, dst);
      DO_imm_mandr_r("insertps", 56, src, dst);
      DO_imm_mandr_r("insertps", 57, src, dst);
      DO_imm_mandr_r("insertps", 58, src, dst);
      DO_imm_mandr_r("insertps", 59, src, dst);
      DO_imm_mandr_r("insertps", 60, src, dst);
      DO_imm_mandr_r("insertps", 61, src, dst);
      DO_imm_mandr_r("insertps", 62, src, dst);
      DO_imm_mandr_r("insertps", 63, src, dst);
      DO_imm_mandr_r("insertps", 64, src, dst);
      DO_imm_mandr_r("insertps", 65, src, dst);
      DO_imm_mandr_r("insertps", 66, src, dst);
      DO_imm_mandr_r("insertps", 67, src, dst);
      DO_imm_mandr_r("insertps", 68, src, dst);
      DO_imm_mandr_r("insertps", 69, src, dst);
      DO_imm_mandr_r("insertps", 70, src, dst);
      DO_imm_mandr_r("insertps", 71, src, dst);
      DO_imm_mandr_r("insertps", 72, src, dst);
      DO_imm_mandr_r("insertps", 73, src, dst);
      DO_imm_mandr_r("insertps", 74, src, dst);
      DO_imm_mandr_r("insertps", 75, src, dst);
      DO_imm_mandr_r("insertps", 76, src, dst);
      DO_imm_mandr_r("insertps", 77, src, dst);
      DO_imm_mandr_r("insertps", 78, src, dst);
      DO_imm_mandr_r("insertps", 79, src, dst);
      DO_imm_mandr_r("insertps", 80, src, dst);
      DO_imm_mandr_r("insertps", 81, src, dst);
      DO_imm_mandr_r("insertps", 82, src, dst);
      DO_imm_mandr_r("insertps", 83, src, dst);
      DO_imm_mandr_r("insertps", 84, src, dst);
      DO_imm_mandr_r("insertps", 85, src, dst);
      DO_imm_mandr_r("insertps", 86, src, dst);
      DO_imm_mandr_r("insertps", 87, src, dst);
      DO_imm_mandr_r("insertps", 88, src, dst);
      DO_imm_mandr_r("insertps", 89, src, dst);
      DO_imm_mandr_r("insertps", 90, src, dst);
      DO_imm_mandr_r("insertps", 91, src, dst);
      DO_imm_mandr_r("insertps", 92, src, dst);
      DO_imm_mandr_r("insertps", 93, src, dst);
      DO_imm_mandr_r("insertps", 94, src, dst);
      DO_imm_mandr_r("insertps", 95, src, dst);
      DO_imm_mandr_r("insertps", 96, src, dst);
      DO_imm_mandr_r("insertps", 97, src, dst);
      DO_imm_mandr_r("insertps", 98, src, dst);
      DO_imm_mandr_r("insertps", 99, src, dst);
      DO_imm_mandr_r("insertps", 100, src, dst);
      DO_imm_mandr_r("insertps", 101, src, dst);
      DO_imm_mandr_r("insertps", 102, src, dst);
      DO_imm_mandr_r("insertps", 103, src, dst);
      DO_imm_mandr_r("insertps", 104, src, dst);
      DO_imm_mandr_r("insertps", 105, src, dst);
      DO_imm_mandr_r("insertps", 106, src, dst);
      DO_imm_mandr_r("insertps", 107, src, dst);
      DO_imm_mandr_r("insertps", 108, src, dst);
      DO_imm_mandr_r("insertps", 109, src, dst);
      DO_imm_mandr_r("insertps", 110, src, dst);
      DO_imm_mandr_r("insertps", 111, src, dst);
      DO_imm_mandr_r("insertps", 112, src, dst);
      DO_imm_mandr_r("insertps", 113, src, dst);
      DO_imm_mandr_r("insertps", 114, src, dst);
      DO_imm_mandr_r("insertps", 115, src, dst);
      DO_imm_mandr_r("insertps", 116, src, dst);
      DO_imm_mandr_r("insertps", 117, src, dst);
      DO_imm_mandr_r("insertps", 118, src, dst);
      DO_imm_mandr_r("insertps", 119, src, dst);
      DO_imm_mandr_r("insertps", 120, src, dst);
      DO_imm_mandr_r("insertps", 121, src, dst);
      DO_imm_mandr_r("insertps", 122, src, dst);
      DO_imm_mandr_r("insertps", 123, src, dst);
      DO_imm_mandr_r("insertps", 124, src, dst);
      DO_imm_mandr_r("insertps", 125, src, dst);
      DO_imm_mandr_r("insertps", 126, src, dst);
      DO_imm_mandr_r("insertps", 127, src, dst);
      DO_imm_mandr_r("insertps", 128, src, dst);
      DO_imm_mandr_r("insertps", 129, src, dst);
      DO_imm_mandr_r("insertps", 130, src, dst);
      DO_imm_mandr_r("insertps", 131, src, dst);
      DO_imm_mandr_r("insertps", 132, src, dst);
      DO_imm_mandr_r("insertps", 133, src, dst);
      DO_imm_mandr_r("insertps", 134, src, dst);
      DO_imm_mandr_r("insertps", 135, src, dst);
      DO_imm_mandr_r("insertps", 136, src, dst);
      DO_imm_mandr_r("insertps", 137, src, dst);
      DO_imm_mandr_r("insertps", 138, src, dst);
      DO_imm_mandr_r("insertps", 139, src, dst);
      DO_imm_mandr_r("insertps", 140, src, dst);
      DO_imm_mandr_r("insertps", 141, src, dst);
      DO_imm_mandr_r("insertps", 142, src, dst);
      DO_imm_mandr_r("insertps", 143, src, dst);
      DO_imm_mandr_r("insertps", 144, src, dst);
      DO_imm_mandr_r("insertps", 145, src, dst);
      DO_imm_mandr_r("insertps", 146, src, dst);
      DO_imm_mandr_r("insertps", 147, src, dst);
      DO_imm_mandr_r("insertps", 148, src, dst);
      DO_imm_mandr_r("insertps", 149, src, dst);
      DO_imm_mandr_r("insertps", 150, src, dst);
      DO_imm_mandr_r("insertps", 151, src, dst);
      DO_imm_mandr_r("insertps", 152, src, dst);
      DO_imm_mandr_r("insertps", 153, src, dst);
      DO_imm_mandr_r("insertps", 154, src, dst);
      DO_imm_mandr_r("insertps", 155, src, dst);
      DO_imm_mandr_r("insertps", 156, src, dst);
      DO_imm_mandr_r("insertps", 157, src, dst);
      DO_imm_mandr_r("insertps", 158, src, dst);
      DO_imm_mandr_r("insertps", 159, src, dst);
      DO_imm_mandr_r("insertps", 160, src, dst);
      DO_imm_mandr_r("insertps", 161, src, dst);
      DO_imm_mandr_r("insertps", 162, src, dst);
      DO_imm_mandr_r("insertps", 163, src, dst);
      DO_imm_mandr_r("insertps", 164, src, dst);
      DO_imm_mandr_r("insertps", 165, src, dst);
      DO_imm_mandr_r("insertps", 166, src, dst);
      DO_imm_mandr_r("insertps", 167, src, dst);
      DO_imm_mandr_r("insertps", 168, src, dst);
      DO_imm_mandr_r("insertps", 169, src, dst);
      DO_imm_mandr_r("insertps", 170, src, dst);
      DO_imm_mandr_r("insertps", 171, src, dst);
      DO_imm_mandr_r("insertps", 172, src, dst);
      DO_imm_mandr_r("insertps", 173, src, dst);
      DO_imm_mandr_r("insertps", 174, src, dst);
      DO_imm_mandr_r("insertps", 175, src, dst);
      DO_imm_mandr_r("insertps", 176, src, dst);
      DO_imm_mandr_r("insertps", 177, src, dst);
      DO_imm_mandr_r("insertps", 178, src, dst);
      DO_imm_mandr_r("insertps", 179, src, dst);
      DO_imm_mandr_r("insertps", 180, src, dst);
      DO_imm_mandr_r("insertps", 181, src, dst);
      DO_imm_mandr_r("insertps", 182, src, dst);
      DO_imm_mandr_r("insertps", 183, src, dst);
      DO_imm_mandr_r("insertps", 184, src, dst);
      DO_imm_mandr_r("insertps", 185, src, dst);
      DO_imm_mandr_r("insertps", 186, src, dst);
      DO_imm_mandr_r("insertps", 187, src, dst);
      DO_imm_mandr_r("insertps", 188, src, dst);
      DO_imm_mandr_r("insertps", 189, src, dst);
      DO_imm_mandr_r("insertps", 190, src, dst);
      DO_imm_mandr_r("insertps", 191, src, dst);
      DO_imm_mandr_r("insertps", 192, src, dst);
      DO_imm_mandr_r("insertps", 193, src, dst);
      DO_imm_mandr_r("insertps", 194, src, dst);
      DO_imm_mandr_r("insertps", 195, src, dst);
      DO_imm_mandr_r("insertps", 196, src, dst);
      DO_imm_mandr_r("insertps", 197, src, dst);
      DO_imm_mandr_r("insertps", 198, src, dst);
      DO_imm_mandr_r("insertps", 199, src, dst);
      DO_imm_mandr_r("insertps", 200, src, dst);
      DO_imm_mandr_r("insertps", 201, src, dst);
      DO_imm_mandr_r("insertps", 202, src, dst);
      DO_imm_mandr_r("insertps", 203, src, dst);
      DO_imm_mandr_r("insertps", 204, src, dst);
      DO_imm_mandr_r("insertps", 205, src, dst);
      DO_imm_mandr_r("insertps", 206, src, dst);
      DO_imm_mandr_r("insertps", 207, src, dst);
      DO_imm_mandr_r("insertps", 208, src, dst);
      DO_imm_mandr_r("insertps", 209, src, dst);
      DO_imm_mandr_r("insertps", 210, src, dst);
      DO_imm_mandr_r("insertps", 211, src, dst);
      DO_imm_mandr_r("insertps", 212, src, dst);
      DO_imm_mandr_r("insertps", 213, src, dst);
      DO_imm_mandr_r("insertps", 214, src, dst);
      DO_imm_mandr_r("insertps", 215, src, dst);
      DO_imm_mandr_r("insertps", 216, src, dst);
      DO_imm_mandr_r("insertps", 217, src, dst);
      DO_imm_mandr_r("insertps", 218, src, dst);
      DO_imm_mandr_r("insertps", 219, src, dst);
      DO_imm_mandr_r("insertps", 220, src, dst);
      DO_imm_mandr_r("insertps", 221, src, dst);
      DO_imm_mandr_r("insertps", 222, src, dst);
      DO_imm_mandr_r("insertps", 223, src, dst);
      DO_imm_mandr_r("insertps", 224, src, dst);
      DO_imm_mandr_r("insertps", 225, src, dst);
      DO_imm_mandr_r("insertps", 226, src, dst);
      DO_imm_mandr_r("insertps", 227, src, dst);
      DO_imm_mandr_r("insertps", 228, src, dst);
      DO_imm_mandr_r("insertps", 229, src, dst);
      DO_imm_mandr_r("insertps", 230, src, dst);
      DO_imm_mandr_r("insertps", 231, src, dst);
      DO_imm_mandr_r("insertps", 232, src, dst);
      DO_imm_mandr_r("insertps", 233, src, dst);
      DO_imm_mandr_r("insertps", 234, src, dst);
      DO_imm_mandr_r("insertps", 235, src, dst);
      DO_imm_mandr_r("insertps", 236, src, dst);
      DO_imm_mandr_r("insertps", 237, src, dst);
      DO_imm_mandr_r("insertps", 238, src, dst);
      DO_imm_mandr_r("insertps", 239, src, dst);
      DO_imm_mandr_r("insertps", 240, src, dst);
      DO_imm_mandr_r("insertps", 241, src, dst);
      DO_imm_mandr_r("insertps", 242, src, dst);
      DO_imm_mandr_r("insertps", 243, src, dst);
      DO_imm_mandr_r("insertps", 244, src, dst);
      DO_imm_mandr_r("insertps", 245, src, dst);
      DO_imm_mandr_r("insertps", 246, src, dst);
      DO_imm_mandr_r("insertps", 247, src, dst);
      DO_imm_mandr_r("insertps", 248, src, dst);
      DO_imm_mandr_r("insertps", 249, src, dst);
      DO_imm_mandr_r("insertps", 250, src, dst);
      DO_imm_mandr_r("insertps", 251, src, dst);
      DO_imm_mandr_r("insertps", 252, src, dst);
      DO_imm_mandr_r("insertps", 253, src, dst);
      DO_imm_mandr_r("insertps", 254, src, dst);
      DO_imm_mandr_r("insertps", 255, src, dst);
   }
}

/* mpsadbw: 50 rounds; each round draws a fresh random src/dst pair and
   runs immediates 0..7 on it. */
void test_MPSADBW ( void )
{
   V128 src, dst;
   Int i;
   for (i = 0; i < 50; i++) {
      randV128(&src);
      randV128(&dst);
      DO_imm_mandr_r("mpsadbw", 0, src, dst);
      DO_imm_mandr_r("mpsadbw", 1, src, dst);
      DO_imm_mandr_r("mpsadbw", 2, src, dst);
      DO_imm_mandr_r("mpsadbw", 3, src, dst);
      DO_imm_mandr_r("mpsadbw", 4, src, dst);
      DO_imm_mandr_r("mpsadbw", 5, src, dst);
      DO_imm_mandr_r("mpsadbw", 6, src, dst);
      DO_imm_mandr_r("mpsadbw", 7, src, dst);
   }
}

/* packusdw: nine rounds on random operands, then a tenth round on a
   hand-built pair in which only bytes 0 and 1 of every 32-bit lane are
   non-zero. */
void test_PACKUSDW ( void )
{
   V128 src, dst;
   Int i;
   for (i = 0; i < 10; i++) {
      if (i < 9) {
         randV128(&src);
         randV128(&dst);
      } else {
         /* Final round: clear both vectors, then set the two lowest
            bytes of each 4-byte lane. */
         memset(&src, 0, sizeof(src));
         memset(&dst, 0, sizeof(dst));
         src[0] = 0x11; src[1] = 0x22;
         src[4] = 0x33; src[5] = 0x44;
         src[8] = 0x55; src[9] = 0x66;
         src[12] = 0x77; src[13] = 0x88;
         dst[0] = 0xaa; dst[1] = 0xbb;
         dst[4] = 0xcc; dst[5] = 0xdd;
         dst[8] = 0xee; dst[9] = 0xff;
         dst[12] = 0xa1; dst[13] = 0xb2;
      }
      DO_mandr_r("packusdw", src, dst);
   }
}

/* pblendw: one random src/dst pair, exercised with every immediate
   0..255. */
void test_PBLENDW ( void )
{
   V128 src, dst;
   randV128(&src);
   randV128(&dst);
   {
      DO_imm_mandr_r("pblendw", 0, src, dst);
      DO_imm_mandr_r("pblendw", 1, src, dst);
      DO_imm_mandr_r("pblendw", 2, src, dst);
      DO_imm_mandr_r("pblendw", 3, src, dst);
      DO_imm_mandr_r("pblendw", 4, src, dst);
      DO_imm_mandr_r("pblendw", 5, src, dst);
      DO_imm_mandr_r("pblendw", 6, src, dst);
      DO_imm_mandr_r("pblendw", 7, src, dst);
      DO_imm_mandr_r("pblendw", 8, src, dst);
      DO_imm_mandr_r("pblendw", 9, src, dst);
      DO_imm_mandr_r("pblendw", 10, src, dst);
      DO_imm_mandr_r("pblendw", 11, src, dst);
      DO_imm_mandr_r("pblendw", 12, src, dst);
      DO_imm_mandr_r("pblendw", 13, src, dst);
      DO_imm_mandr_r("pblendw", 14, src, dst);
      DO_imm_mandr_r("pblendw", 15, src, dst);
      DO_imm_mandr_r("pblendw", 16, src, dst);
      DO_imm_mandr_r("pblendw", 17, src, dst);
      DO_imm_mandr_r("pblendw", 18, src, dst);
      DO_imm_mandr_r("pblendw", 19, src, dst);
      DO_imm_mandr_r("pblendw", 20, src, dst);
      DO_imm_mandr_r("pblendw", 21, src, dst);
      DO_imm_mandr_r("pblendw", 22, src, dst);
      DO_imm_mandr_r("pblendw", 23, src, dst);
      DO_imm_mandr_r("pblendw", 24, src, dst);
      DO_imm_mandr_r("pblendw", 25, src, dst);
      DO_imm_mandr_r("pblendw", 26, src, dst);
      DO_imm_mandr_r("pblendw", 27, src, dst);
      DO_imm_mandr_r("pblendw", 28, src, dst);
      DO_imm_mandr_r("pblendw", 29, src, dst);
      DO_imm_mandr_r("pblendw", 30, src, dst);
      DO_imm_mandr_r("pblendw", 31, src, dst);
      DO_imm_mandr_r("pblendw", 32, src, dst);
      DO_imm_mandr_r("pblendw", 33, src, dst);
      DO_imm_mandr_r("pblendw", 34, src, dst);
      DO_imm_mandr_r("pblendw", 35, src, dst);
      DO_imm_mandr_r("pblendw", 36, src, dst);
      DO_imm_mandr_r("pblendw", 37, src, dst);
      DO_imm_mandr_r("pblendw", 38, src, dst);
      DO_imm_mandr_r("pblendw", 39, src, dst);
      DO_imm_mandr_r("pblendw", 40, src, dst);
      DO_imm_mandr_r("pblendw", 41, src, dst);
      DO_imm_mandr_r("pblendw", 42, src, dst);
      DO_imm_mandr_r("pblendw", 43, src, dst);
      DO_imm_mandr_r("pblendw", 44, src, dst);
      DO_imm_mandr_r("pblendw", 45, src, dst);
      DO_imm_mandr_r("pblendw", 46, src, dst);
      DO_imm_mandr_r("pblendw", 47, src, dst);
      DO_imm_mandr_r("pblendw", 48, src, dst);
      DO_imm_mandr_r("pblendw", 49, src, dst);
      DO_imm_mandr_r("pblendw", 50, src, dst);
      DO_imm_mandr_r("pblendw", 51, src, dst);
      DO_imm_mandr_r("pblendw", 52, src, dst);
      DO_imm_mandr_r("pblendw", 53, src, dst);
      DO_imm_mandr_r("pblendw", 54, src, dst);
      DO_imm_mandr_r("pblendw", 55, src, dst);
      DO_imm_mandr_r("pblendw", 56, src, dst);
      DO_imm_mandr_r("pblendw", 57, src, dst);
      DO_imm_mandr_r("pblendw", 58, src, dst);
      DO_imm_mandr_r("pblendw", 59, src, dst);
      DO_imm_mandr_r("pblendw", 60, src, dst);
      DO_imm_mandr_r("pblendw", 61, src, dst);
      DO_imm_mandr_r("pblendw", 62, src, dst);
      DO_imm_mandr_r("pblendw", 63, src, dst);
      DO_imm_mandr_r("pblendw", 64, src, dst);
      DO_imm_mandr_r("pblendw", 65, src, dst);
      DO_imm_mandr_r("pblendw", 66, src, dst);
      DO_imm_mandr_r("pblendw", 67, src, dst);
      DO_imm_mandr_r("pblendw", 68, src, dst);
      DO_imm_mandr_r("pblendw", 69, src, dst);
      DO_imm_mandr_r("pblendw", 70, src, dst);
      DO_imm_mandr_r("pblendw", 71, src, dst);
      DO_imm_mandr_r("pblendw", 72, src, dst);
      DO_imm_mandr_r("pblendw", 73, src, dst);
      DO_imm_mandr_r("pblendw", 74, src, dst);
      DO_imm_mandr_r("pblendw", 75, src, dst);
      DO_imm_mandr_r("pblendw", 76, src, dst);
      DO_imm_mandr_r("pblendw", 77, src, dst);
      DO_imm_mandr_r("pblendw", 78, src, dst);
      DO_imm_mandr_r("pblendw", 79, src, dst);
      DO_imm_mandr_r("pblendw", 80, src, dst);
      DO_imm_mandr_r("pblendw", 81, src, dst);
      DO_imm_mandr_r("pblendw", 82, src, dst);
      DO_imm_mandr_r("pblendw", 83, src, dst);
      DO_imm_mandr_r("pblendw", 84, src, dst);
      DO_imm_mandr_r("pblendw", 85, src, dst);
      DO_imm_mandr_r("pblendw", 86, src, dst);
      DO_imm_mandr_r("pblendw", 87, src, dst);
      DO_imm_mandr_r("pblendw", 88, src, dst);
      DO_imm_mandr_r("pblendw", 89, src, dst);
      DO_imm_mandr_r("pblendw", 90, src, dst);
      DO_imm_mandr_r("pblendw", 91, src, dst);
      DO_imm_mandr_r("pblendw", 92, src, dst);
      DO_imm_mandr_r("pblendw", 93, src, dst);
      DO_imm_mandr_r("pblendw", 94, src, dst);
      DO_imm_mandr_r("pblendw", 95, src, dst);
      DO_imm_mandr_r("pblendw", 96, src, dst);
      DO_imm_mandr_r("pblendw", 97, src, dst);
      DO_imm_mandr_r("pblendw", 98, src, dst);
      DO_imm_mandr_r("pblendw", 99, src, dst);
      DO_imm_mandr_r("pblendw", 100, src, dst);
      DO_imm_mandr_r("pblendw", 101, src, dst);
      DO_imm_mandr_r("pblendw", 102, src, dst);
      DO_imm_mandr_r("pblendw", 103, src, dst);
      DO_imm_mandr_r("pblendw", 104, src, dst);
      DO_imm_mandr_r("pblendw", 105, src, dst);
      DO_imm_mandr_r("pblendw", 106, src, dst);
      DO_imm_mandr_r("pblendw", 107, src, dst);
      DO_imm_mandr_r("pblendw", 108, src, dst);
      DO_imm_mandr_r("pblendw", 109, src, dst);
      DO_imm_mandr_r("pblendw", 110, src, dst);
      DO_imm_mandr_r("pblendw", 111, src, dst);
      DO_imm_mandr_r("pblendw", 112, src, dst);
      DO_imm_mandr_r("pblendw", 113, src, dst);
      DO_imm_mandr_r("pblendw", 114, src, dst);
      DO_imm_mandr_r("pblendw", 115, src, dst);
      DO_imm_mandr_r("pblendw", 116, src, dst);
      DO_imm_mandr_r("pblendw", 117, src, dst);
      DO_imm_mandr_r("pblendw", 118, src, dst);
      DO_imm_mandr_r("pblendw", 119, src, dst);
      DO_imm_mandr_r("pblendw", 120, src, dst);
      DO_imm_mandr_r("pblendw", 121, src, dst);
      DO_imm_mandr_r("pblendw", 122, src, dst);
      DO_imm_mandr_r("pblendw", 123, src, dst);
      DO_imm_mandr_r("pblendw", 124, src, dst);
      DO_imm_mandr_r("pblendw", 125, src, dst);
      DO_imm_mandr_r("pblendw", 126, src, dst);
      DO_imm_mandr_r("pblendw", 127, src, dst);
      DO_imm_mandr_r("pblendw", 128, src, dst);
      DO_imm_mandr_r("pblendw", 129, src, dst);
      DO_imm_mandr_r("pblendw", 130, src, dst);
      DO_imm_mandr_r("pblendw", 131, src, dst);
      DO_imm_mandr_r("pblendw", 132, src, dst);
      DO_imm_mandr_r("pblendw", 133, src, dst);
      DO_imm_mandr_r("pblendw", 134, src, dst);
      DO_imm_mandr_r("pblendw", 135, src, dst);
      DO_imm_mandr_r("pblendw", 136, src, dst);
      DO_imm_mandr_r("pblendw", 137, src, dst);
      DO_imm_mandr_r("pblendw", 138, src, dst);
      DO_imm_mandr_r("pblendw", 139, src, dst);
      DO_imm_mandr_r("pblendw", 140, src, dst);
      DO_imm_mandr_r("pblendw", 141, src, dst);
      DO_imm_mandr_r("pblendw", 142, src, dst);
      DO_imm_mandr_r("pblendw", 143, src, dst);
      DO_imm_mandr_r("pblendw", 144, src, dst);
      DO_imm_mandr_r("pblendw", 145, src, dst);
      DO_imm_mandr_r("pblendw", 146, src, dst);
      DO_imm_mandr_r("pblendw", 147, src, dst);
      DO_imm_mandr_r("pblendw", 148, src, dst);
      DO_imm_mandr_r("pblendw", 149, src, dst);
      DO_imm_mandr_r("pblendw", 150, src, dst);
      DO_imm_mandr_r("pblendw", 151, src, dst);
      DO_imm_mandr_r("pblendw", 152, src, dst);
      DO_imm_mandr_r("pblendw", 153, src, dst);
      DO_imm_mandr_r("pblendw", 154, src, dst);
      DO_imm_mandr_r("pblendw", 155, src, dst);
      DO_imm_mandr_r("pblendw", 156, src, dst);
      DO_imm_mandr_r("pblendw", 157, src, dst);
      DO_imm_mandr_r("pblendw", 158, src, dst);
      DO_imm_mandr_r("pblendw", 159, src, dst);
      DO_imm_mandr_r("pblendw", 160, src, dst);
      DO_imm_mandr_r("pblendw", 161, src, dst);
      DO_imm_mandr_r("pblendw", 162, src, dst);
      DO_imm_mandr_r("pblendw", 163, src, dst);
      DO_imm_mandr_r("pblendw", 164, src, dst);
      DO_imm_mandr_r("pblendw", 165, src, dst);
      DO_imm_mandr_r("pblendw", 166, src, dst);
      DO_imm_mandr_r("pblendw", 167, src, dst);
      DO_imm_mandr_r("pblendw", 168, src, dst);
      DO_imm_mandr_r("pblendw", 169, src, dst);
      DO_imm_mandr_r("pblendw", 170, src, dst);
      DO_imm_mandr_r("pblendw", 171, src, dst);
      DO_imm_mandr_r("pblendw", 172, src, dst);
      DO_imm_mandr_r("pblendw", 173, src, dst);
      DO_imm_mandr_r("pblendw", 174, src, dst);
      DO_imm_mandr_r("pblendw", 175, src, dst);
      DO_imm_mandr_r("pblendw", 176, src, dst);
      DO_imm_mandr_r("pblendw", 177, src, dst);
      DO_imm_mandr_r("pblendw", 178, src, dst);
      DO_imm_mandr_r("pblendw", 179, src, dst);
      DO_imm_mandr_r("pblendw", 180, src, dst);
      DO_imm_mandr_r("pblendw", 181, src, dst);
      DO_imm_mandr_r("pblendw", 182, src, dst);
      DO_imm_mandr_r("pblendw", 183, src, dst);
      DO_imm_mandr_r("pblendw", 184, src, dst);
      DO_imm_mandr_r("pblendw", 185, src, dst);
      DO_imm_mandr_r("pblendw", 186, src, dst);
      DO_imm_mandr_r("pblendw", 187, src, dst);
      DO_imm_mandr_r("pblendw", 188, src, dst);
      DO_imm_mandr_r("pblendw", 189, src, dst);
      DO_imm_mandr_r("pblendw", 190, src, dst);
      DO_imm_mandr_r("pblendw", 191, src, dst);
      DO_imm_mandr_r("pblendw", 192, src, dst);
      DO_imm_mandr_r("pblendw", 193, src, dst);
      DO_imm_mandr_r("pblendw", 194, src, dst);
      DO_imm_mandr_r("pblendw", 195, src, dst);
      DO_imm_mandr_r("pblendw", 196, src, dst);
      DO_imm_mandr_r("pblendw", 197, src, dst);
      DO_imm_mandr_r("pblendw", 198, src, dst);
      DO_imm_mandr_r("pblendw", 199, src, dst);
      DO_imm_mandr_r("pblendw", 200, src, dst);
      DO_imm_mandr_r("pblendw", 201, src, dst);
      DO_imm_mandr_r("pblendw", 202, src, dst);
      DO_imm_mandr_r("pblendw", 203, src, dst);
      DO_imm_mandr_r("pblendw", 204, src, dst);
      DO_imm_mandr_r("pblendw", 205, src, dst);
      DO_imm_mandr_r("pblendw", 206, src, dst);
      DO_imm_mandr_r("pblendw", 207, src, dst);
      DO_imm_mandr_r("pblendw", 208, src, dst);
      DO_imm_mandr_r("pblendw", 209, src, dst);
      DO_imm_mandr_r("pblendw", 210, src, dst);
      DO_imm_mandr_r("pblendw", 211, src, dst);
      DO_imm_mandr_r("pblendw", 212, src, dst);
      DO_imm_mandr_r("pblendw", 213, src, dst);
      DO_imm_mandr_r("pblendw", 214, src, dst);
      DO_imm_mandr_r("pblendw", 215, src, dst);
      DO_imm_mandr_r("pblendw", 216, src, dst);
      DO_imm_mandr_r("pblendw", 217, src, dst);
      DO_imm_mandr_r("pblendw", 218, src, dst);
      DO_imm_mandr_r("pblendw", 219, src, dst);
      DO_imm_mandr_r("pblendw", 220, src, dst);
      DO_imm_mandr_r("pblendw", 221, src, dst);
      DO_imm_mandr_r("pblendw", 222, src, dst);
      DO_imm_mandr_r("pblendw", 223, src, dst);
      DO_imm_mandr_r("pblendw", 224, src, dst);
      DO_imm_mandr_r("pblendw", 225, src, dst);
      DO_imm_mandr_r("pblendw", 226, src, dst);
      DO_imm_mandr_r("pblendw", 227, src, dst);
      DO_imm_mandr_r("pblendw", 228, src, dst);
      DO_imm_mandr_r("pblendw", 229, src, dst);
      DO_imm_mandr_r("pblendw", 230, src, dst);
      DO_imm_mandr_r("pblendw", 231, src, dst);
      DO_imm_mandr_r("pblendw", 232, src, dst);
      DO_imm_mandr_r("pblendw", 233, src, dst);
      DO_imm_mandr_r("pblendw", 234, src, dst);
      DO_imm_mandr_r("pblendw", 235, src, dst);
      DO_imm_mandr_r("pblendw", 236, src, dst);
      DO_imm_mandr_r("pblendw", 237, src, dst);
      DO_imm_mandr_r("pblendw", 238, src, dst);
      DO_imm_mandr_r("pblendw", 239, src, dst);
      DO_imm_mandr_r("pblendw", 240, src, dst);
      DO_imm_mandr_r("pblendw", 241, src, dst);
      DO_imm_mandr_r("pblendw", 242, src, dst);
      DO_imm_mandr_r("pblendw", 243, src, dst);
      DO_imm_mandr_r("pblendw", 244, src, dst);
      DO_imm_mandr_r("pblendw", 245, src, dst);
      DO_imm_mandr_r("pblendw", 246, src, dst);
      DO_imm_mandr_r("pblendw", 247, src, dst);
      DO_imm_mandr_r("pblendw", 248, src, dst);
      DO_imm_mandr_r("pblendw", 249, src, dst);
      DO_imm_mandr_r("pblendw", 250, src, dst);
      DO_imm_mandr_r("pblendw", 251, src, dst);
      DO_imm_mandr_r("pblendw", 252, src, dst);
      DO_imm_mandr_r("pblendw", 253, src, dst);
      DO_imm_mandr_r("pblendw", 254, src, dst);
      DO_imm_mandr_r("pblendw", 255, src, dst);
   }
}


/* pcmpeqq: ten rounds on random operands.  Random 64-bit halves are
   unlikely to compare equal, so round 6 overwrites bytes 0..7 of both
   operands with 0x55 and round 7 does the same for bytes 8..15. */
void test_PCMPEQQ ( void )
{
   V128 src, dst;
   Int i;
   for (i = 0; i < 10; i++) {
      randV128(&src);
      randV128(&dst);
      switch (i - 6) {
         case 0: memset(&src[0], 0x55, 8);
                 memset(&dst[0], 0x55, 8); break;
         case 1: memset(&src[8], 0x55, 8);
                 memset(&dst[8], 0x55, 8); break;
         default:
            break;
      }
      DO_mandr_r("pcmpeqq", src, dst);
   }
}


/* pextrb: every byte index 0..15 of one random vector.  The trailing
   "d" is handed to DO_imm_r_to_mandrscalar unchanged (presumably a
   register-name suffix -- confirm against the macro definition). */
void test_PEXTRB ( void )
{
   V128 src;
   randV128(&src);
   DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
   DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
}

/* pinsrb: byte indices 0..15, each with a freshly drawn random 64-bit
   scalar source. */
void test_PINSRB ( void )
{
   ULong src;
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
}


/* pextrw: every 16-bit lane index 0..7 of one random vector. */
void test_PEXTRW ( void )
{
   V128 src;
   randV128(&src);
   DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
   DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
}

/* pinsrw: lane indices 0..7, each with a freshly drawn random scalar. */
void test_PINSRW ( void )
{
   ULong src;
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
}


/* pextrd: every 32-bit lane index 0..3 of one random vector. */
void test_PEXTRD ( void )
{
   V128 src;
   randV128(&src);
   DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
   DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
   DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
   DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
}

/* pinsrd: lane indices 0..3, each with a freshly drawn random scalar. */
void test_PINSRD ( void )
{
   ULong src;
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
}


/* pextrq: both 64-bit lane indices of one random vector.  Unlike the
   narrower extracts, the suffix argument here is the empty string. */
void test_PEXTRQ ( void )
{
   V128 src;
   randV128(&src);
   DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
   DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
}

/* pinsrq: both 64-bit lane indices, each with a freshly drawn random
   scalar; the suffix argument is the empty string. */
void test_PINSRQ ( void )
{
   ULong src;
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
   src = randULong();
   DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
}

1733 void test_EXTRACTPS ( void ) 1734 { 1735 V128 src; 1736 randV128(&src); 1737 DO_imm_r_to_mandrscalar("extractps", 0, src, "d"); 1738 DO_imm_r_to_mandrscalar("extractps", 1, src, "d"); 1739 DO_imm_r_to_mandrscalar("extractps", 2, src, "d"); 1740 DO_imm_r_to_mandrscalar("extractps", 3, src, "d"); 1741 } 1742 1743 1744 void test_PHMINPOSUW ( void ) 1745 { 1746 V128 src, dst; 1747 Int i; 1748 for (i = 0; i < 20; i++) { 1749 randV128(&src); 1750 randV128(&dst); 1751 DO_mandr_r("phminposuw", src, dst); 1752 } 1753 memset(src, 0x55, sizeof(src)); 1754 memset(dst, 0xAA, sizeof(dst)); 1755 DO_mandr_r("phminposuw", src, dst); 1756 } 1757 1758 void test_PMAXSB ( void ) 1759 { 1760 V128 src, dst; 1761 Int i; 1762 for (i = 0; i < 10; i++) { 1763 randV128(&src); 1764 randV128(&dst); 1765 DO_mandr_r("pmaxsb", src, dst); 1766 } 1767 } 1768 1769 void test_PMAXSD ( void ) 1770 { 1771 V128 src, dst; 1772 Int i; 1773 for (i = 0; i < 10; i++) { 1774 randV128(&src); 1775 randV128(&dst); 1776 DO_mandr_r("pmaxsd", src, dst); 1777 } 1778 } 1779 1780 void test_PMAXUD ( void ) 1781 { 1782 V128 src, dst; 1783 Int i; 1784 for (i = 0; i < 10; i++) { 1785 randV128(&src); 1786 randV128(&dst); 1787 DO_mandr_r("pmaxud", src, dst); 1788 } 1789 } 1790 1791 void test_PMAXUW ( void ) 1792 { 1793 V128 src, dst; 1794 Int i; 1795 for (i = 0; i < 10; i++) { 1796 randV128(&src); 1797 randV128(&dst); 1798 DO_mandr_r("pmaxuw", src, dst); 1799 } 1800 } 1801 1802 void test_PMINSB ( void ) 1803 { 1804 V128 src, dst; 1805 Int i; 1806 for (i = 0; i < 10; i++) { 1807 randV128(&src); 1808 randV128(&dst); 1809 DO_mandr_r("pminsb", src, dst); 1810 } 1811 } 1812 1813 void test_PMINSD ( void ) 1814 { 1815 V128 src, dst; 1816 Int i; 1817 for (i = 0; i < 10; i++) { 1818 randV128(&src); 1819 randV128(&dst); 1820 DO_mandr_r("pminsd", src, dst); 1821 } 1822 } 1823 1824 void test_PMINUD ( void ) 1825 { 1826 V128 src, dst; 1827 Int i; 1828 for (i = 0; i < 10; i++) { 1829 randV128(&src); 1830 randV128(&dst); 1831 
DO_mandr_r("pminud", src, dst); 1832 } 1833 } 1834 1835 void test_PMINUW ( void ) 1836 { 1837 V128 src, dst; 1838 Int i; 1839 for (i = 0; i < 10; i++) { 1840 randV128(&src); 1841 randV128(&dst); 1842 DO_mandr_r("pminuw", src, dst); 1843 } 1844 } 1845 1846 void test_PMOVSXBW ( void ) 1847 { 1848 V128 src, dst; 1849 Int i; 1850 for (i = 0; i < 10; i++) { 1851 randV128(&src); 1852 randV128(&dst); 1853 DO_mandr_r("pmovsxbw", src, dst); 1854 } 1855 } 1856 1857 void test_PMOVSXBD ( void ) 1858 { 1859 V128 src, dst; 1860 Int i; 1861 for (i = 0; i < 10; i++) { 1862 randV128(&src); 1863 randV128(&dst); 1864 DO_mandr_r("pmovsxbd", src, dst); 1865 } 1866 } 1867 1868 void test_PMOVSXBQ ( void ) 1869 { 1870 V128 src, dst; 1871 Int i; 1872 for (i = 0; i < 10; i++) { 1873 randV128(&src); 1874 randV128(&dst); 1875 DO_mandr_r("pmovsxbq", src, dst); 1876 } 1877 } 1878 1879 void test_PMOVSXWD ( void ) 1880 { 1881 V128 src, dst; 1882 Int i; 1883 for (i = 0; i < 10; i++) { 1884 randV128(&src); 1885 randV128(&dst); 1886 DO_mandr_r("pmovsxwd", src, dst); 1887 } 1888 } 1889 1890 void test_PMOVSXWQ ( void ) 1891 { 1892 V128 src, dst; 1893 Int i; 1894 for (i = 0; i < 10; i++) { 1895 randV128(&src); 1896 randV128(&dst); 1897 DO_mandr_r("pmovsxwq", src, dst); 1898 } 1899 } 1900 1901 void test_PMOVSXDQ ( void ) 1902 { 1903 V128 src, dst; 1904 Int i; 1905 for (i = 0; i < 10; i++) { 1906 randV128(&src); 1907 randV128(&dst); 1908 DO_mandr_r("pmovsxdq", src, dst); 1909 } 1910 } 1911 1912 void test_PMOVZXBW ( void ) 1913 { 1914 V128 src, dst; 1915 Int i; 1916 for (i = 0; i < 10; i++) { 1917 randV128(&src); 1918 randV128(&dst); 1919 DO_mandr_r("pmovzxbw", src, dst); 1920 } 1921 } 1922 1923 void test_PMOVZXBD ( void ) 1924 { 1925 V128 src, dst; 1926 Int i; 1927 for (i = 0; i < 10; i++) { 1928 randV128(&src); 1929 randV128(&dst); 1930 DO_mandr_r("pmovzxbd", src, dst); 1931 } 1932 } 1933 1934 void test_PMOVZXBQ ( void ) 1935 { 1936 V128 src, dst; 1937 Int i; 1938 for (i = 0; i < 10; i++) { 1939 
randV128(&src); 1940 randV128(&dst); 1941 DO_mandr_r("pmovzxbq", src, dst); 1942 } 1943 } 1944 1945 void test_PMOVZXWD ( void ) 1946 { 1947 V128 src, dst; 1948 Int i; 1949 for (i = 0; i < 10; i++) { 1950 randV128(&src); 1951 randV128(&dst); 1952 DO_mandr_r("pmovzxwd", src, dst); 1953 } 1954 } 1955 1956 void test_PMOVZXWQ ( void ) 1957 { 1958 V128 src, dst; 1959 Int i; 1960 for (i = 0; i < 10; i++) { 1961 randV128(&src); 1962 randV128(&dst); 1963 DO_mandr_r("pmovzxwq", src, dst); 1964 } 1965 } 1966 1967 void test_PMOVZXDQ ( void ) 1968 { 1969 V128 src, dst; 1970 Int i; 1971 for (i = 0; i < 10; i++) { 1972 randV128(&src); 1973 randV128(&dst); 1974 DO_mandr_r("pmovzxdq", src, dst); 1975 } 1976 } 1977 1978 void test_PMULDQ ( void ) 1979 { 1980 V128 src, dst; 1981 Int i; 1982 for (i = 0; i < 10; i++) { 1983 randV128(&src); 1984 randV128(&dst); 1985 DO_mandr_r("pmuldq", src, dst); 1986 } 1987 } 1988 1989 1990 void test_PMULLD ( void ) 1991 { 1992 V128 src, dst; 1993 Int i; 1994 for (i = 0; i < 10; i++) { 1995 randV128(&src); 1996 randV128(&dst); 1997 DO_mandr_r("pmulld", src, dst); 1998 } 1999 } 2000 2001 2002 void test_POPCNTQ ( void ) 2003 { 2004 ULong block[4]; 2005 Int i; 2006 ULong oszacp_mask = 0x8D5; 2007 for (i = 0; i < 10; i++) { 2008 block[0] = i == 0 ? 0 : randULong(); 2009 block[1] = randULong(); 2010 block[2] = randULong(); 2011 block[3] = randULong(); 2012 __asm__ __volatile__( 2013 "movq %0, %%rax" "\n\t" 2014 "movq 0(%%rax), %%rdi" "\n\t" 2015 "movq 8(%%rax), %%r11" "\n\t" 2016 #ifndef VGP_amd64_darwin 2017 "popcntq %%rdi, %%r11" "\n\t" 2018 #else 2019 "popcnt %%rdi, %%r11" "\n\t" 2020 #endif 2021 "movq %%r11, 16(%%rax)" "\n\t" 2022 "pushfq" "\n\t" 2023 "popq %%r12" "\n\t" 2024 "movq %%r12, 24(%%rax)" "\n" 2025 : /*out*/ 2026 : /*in*/"r"(&block[0]) 2027 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2028 ); 2029 printf("r popcntq %016llx %016llx %016llx %016llx\n", 2030 block[0], block[1], block[2], block[3] & oszacp_mask); 2031 2032 block[0] = i == 0 ? 
0 : randULong(); 2033 block[1] = randULong(); 2034 block[2] = randULong(); 2035 block[3] = randULong(); 2036 __asm__ __volatile__( 2037 "movq %0, %%rax" "\n\t" 2038 "movq 8(%%rax), %%r11" "\n\t" 2039 #ifndef VGP_amd64_darwin 2040 "popcntq 0(%%rax), %%r11" "\n\t" 2041 #else 2042 "popcnt 0(%%rax), %%r11" "\n\t" 2043 #endif 2044 "movq %%r11, 16(%%rax)" "\n\t" 2045 "pushfq" "\n\t" 2046 "popq %%r12" "\n\t" 2047 "movq %%r12, 24(%%rax)" "\n" 2048 : /*out*/ 2049 : /*in*/"r"(&block[0]) 2050 : /*trash*/ "cc", "memory", "r11", "r12" 2051 ); 2052 printf("m popcntq %016llx %016llx %016llx %016llx\n", 2053 block[0], block[1], block[2], block[3] & oszacp_mask); 2054 } 2055 } 2056 2057 2058 void test_POPCNTL ( void ) 2059 { 2060 ULong block[4]; 2061 Int i; 2062 ULong oszacp_mask = 0x8D5; 2063 for (i = 0; i < 10; i++) { 2064 block[0] = i == 0 ? 0 : randULong(); 2065 block[1] = randULong(); 2066 block[2] = randULong(); 2067 block[3] = randULong(); 2068 __asm__ __volatile__( 2069 "movq %0, %%rax" "\n\t" 2070 "movq 0(%%rax), %%rdi" "\n\t" 2071 "movq 8(%%rax), %%r11" "\n\t" 2072 #ifndef VGP_amd64_darwin 2073 "popcntl %%edi, %%r11d" "\n\t" 2074 #else 2075 "popcnt %%edi, %%r11d" "\n\t" 2076 #endif 2077 "movq %%r11, 16(%%rax)" "\n\t" 2078 "pushfq" "\n\t" 2079 "popq %%r12" "\n\t" 2080 "movq %%r12, 24(%%rax)" "\n" 2081 : /*out*/ 2082 : /*in*/"r"(&block[0]) 2083 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2084 ); 2085 printf("r popcntl %016llx %016llx %016llx %016llx\n", 2086 block[0], block[1], block[2], block[3] & oszacp_mask); 2087 2088 block[0] = i == 0 ? 
0 : randULong(); 2089 block[1] = randULong(); 2090 block[2] = randULong(); 2091 block[3] = randULong(); 2092 __asm__ __volatile__( 2093 "movq %0, %%rax" "\n\t" 2094 "movq 8(%%rax), %%r11" "\n\t" 2095 #ifndef VGP_amd64_darwin 2096 "popcntl 0(%%rax), %%r11d" "\n\t" 2097 #else 2098 "popcnt 0(%%rax), %%r11d" "\n\t" 2099 #endif 2100 "movq %%r11, 16(%%rax)" "\n\t" 2101 "pushfq" "\n\t" 2102 "popq %%r12" "\n\t" 2103 "movq %%r12, 24(%%rax)" "\n" 2104 : /*out*/ 2105 : /*in*/"r"(&block[0]) 2106 : /*trash*/ "cc", "memory", "r11", "r12" 2107 ); 2108 printf("m popcntl %016llx %016llx %016llx %016llx\n", 2109 block[0], block[1], block[2], block[3] & oszacp_mask); 2110 } 2111 } 2112 2113 2114 void test_POPCNTW ( void ) 2115 { 2116 ULong block[4]; 2117 Int i; 2118 ULong oszacp_mask = 0x8D5; 2119 for (i = 0; i < 10; i++) { 2120 block[0] = i == 0 ? 0 : randULong(); 2121 block[1] = randULong(); 2122 block[2] = randULong(); 2123 block[3] = randULong(); 2124 __asm__ __volatile__( 2125 "movq %0, %%rax" "\n\t" 2126 "movq 0(%%rax), %%rdi" "\n\t" 2127 "movq 8(%%rax), %%r11" "\n\t" 2128 #ifndef VGP_amd64_darwin 2129 "popcntw %%di, %%r11w" "\n\t" 2130 #else 2131 "popcnt %%di, %%r11w" "\n\t" 2132 #endif 2133 "movq %%r11, 16(%%rax)" "\n\t" 2134 "pushfq" "\n\t" 2135 "popq %%r12" "\n\t" 2136 "movq %%r12, 24(%%rax)" "\n" 2137 : /*out*/ 2138 : /*in*/"r"(&block[0]) 2139 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2140 ); 2141 printf("r popcntw %016llx %016llx %016llx %016llx\n", 2142 block[0], block[1], block[2], block[3] & oszacp_mask); 2143 2144 block[0] = i == 0 ? 
0 : randULong(); 2145 block[1] = randULong(); 2146 block[2] = randULong(); 2147 block[3] = randULong(); 2148 __asm__ __volatile__( 2149 "movq %0, %%rax" "\n\t" 2150 "movq 8(%%rax), %%r11" "\n\t" 2151 #ifndef VGP_amd64_darwin 2152 "popcntw 0(%%rax), %%r11w" "\n\t" 2153 #else 2154 "popcnt 0(%%rax), %%r11w" "\n\t" 2155 #endif 2156 "movq %%r11, 16(%%rax)" "\n\t" 2157 "pushfq" "\n\t" 2158 "popq %%r12" "\n\t" 2159 "movq %%r12, 24(%%rax)" "\n" 2160 : /*out*/ 2161 : /*in*/"r"(&block[0]) 2162 : /*trash*/ "cc", "memory", "r11", "r12" 2163 ); 2164 printf("m popcntw %016llx %016llx %016llx %016llx\n", 2165 block[0], block[1], block[2], block[3] & oszacp_mask); 2166 } 2167 } 2168 2169 2170 void test_PCMPGTQ ( void ) 2171 { 2172 V128 spec[7]; 2173 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL ); 2174 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL ); 2175 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL ); 2176 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL ); 2177 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL ); 2178 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL ); 2179 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL ); 2180 2181 V128 src, dst; 2182 Int i, j; 2183 for (i = 0; i < 10; i++) { 2184 randV128(&src); 2185 randV128(&dst); 2186 DO_mandr_r("pcmpgtq", src, dst); 2187 } 2188 for (i = 0; i < 7; i++) { 2189 for (j = 0; j < 7; j++) { 2190 memcpy(&src, &spec[i], 16); 2191 memcpy(&dst, &spec[j], 16); 2192 DO_mandr_r("pcmpgtq", src, dst); 2193 } 2194 } 2195 } 2196 2197 /* ------------ ROUNDSD ------------ */ 2198 2199 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2200 { 2201 if (mem) { 2202 __asm__ __volatile__( 2203 "movupd (%1), %%xmm11" "\n\t" 2204 "roundsd $0, (%0), %%xmm11" "\n\t" 2205 "movupd %%xmm11, (%1)" "\n" 2206 : /*OUT*/ 2207 : /*IN*/ "r"(src), "r"(dst) 2208 : /*TRASH*/ "xmm11" 2209 ); 2210 } else { 
2211 __asm__ __volatile__( 2212 "movupd (%1), %%xmm11" "\n\t" 2213 "movupd (%0), %%xmm2" "\n\t" 2214 "roundsd $0, %%xmm2, %%xmm11" "\n\t" 2215 "movupd %%xmm11, (%1)" "\n" 2216 : /*OUT*/ 2217 : /*IN*/ "r"(src), "r"(dst) 2218 : /*TRASH*/ "xmm11","xmm2" 2219 ); 2220 } 2221 } 2222 2223 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2224 { 2225 if (mem) { 2226 __asm__ __volatile__( 2227 "movupd (%1), %%xmm11" "\n\t" 2228 "roundsd $1, (%0), %%xmm11" "\n\t" 2229 "movupd %%xmm11, (%1)" "\n" 2230 : /*OUT*/ 2231 : /*IN*/ "r"(src), "r"(dst) 2232 : /*TRASH*/ "xmm11" 2233 ); 2234 } else { 2235 __asm__ __volatile__( 2236 "movupd (%1), %%xmm11" "\n\t" 2237 "movupd (%0), %%xmm2" "\n\t" 2238 "roundsd $1, %%xmm2, %%xmm11" "\n\t" 2239 "movupd %%xmm11, (%1)" "\n" 2240 : /*OUT*/ 2241 : /*IN*/ "r"(src), "r"(dst) 2242 : /*TRASH*/ "xmm11","xmm2" 2243 ); 2244 } 2245 } 2246 2247 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2248 { 2249 if (mem) { 2250 __asm__ __volatile__( 2251 "movupd (%1), %%xmm11" "\n\t" 2252 "roundsd $2, (%0), %%xmm11" "\n\t" 2253 "movupd %%xmm11, (%1)" "\n" 2254 : /*OUT*/ 2255 : /*IN*/ "r"(src), "r"(dst) 2256 : /*TRASH*/ "xmm11" 2257 ); 2258 } else { 2259 __asm__ __volatile__( 2260 "movupd (%1), %%xmm11" "\n\t" 2261 "movupd (%0), %%xmm2" "\n\t" 2262 "roundsd $2, %%xmm2, %%xmm11" "\n\t" 2263 "movupd %%xmm11, (%1)" "\n" 2264 : /*OUT*/ 2265 : /*IN*/ "r"(src), "r"(dst) 2266 : /*TRASH*/ "xmm11","xmm2" 2267 ); 2268 } 2269 } 2270 2271 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2272 { 2273 if (mem) { 2274 __asm__ __volatile__( 2275 "movupd (%1), %%xmm11" "\n\t" 2276 "roundsd $3, (%0), %%xmm11" "\n\t" 2277 "movupd %%xmm11, (%1)" "\n" 2278 : /*OUT*/ 2279 : /*IN*/ "r"(src), "r"(dst) 2280 : /*TRASH*/ "xmm11" 2281 ); 2282 } else { 2283 __asm__ __volatile__( 2284 "movupd (%1), %%xmm11" "\n\t" 2285 "movupd (%0), %%xmm2" "\n\t" 2286 "roundsd $3, %%xmm2, %%xmm11" "\n\t" 2287 "movupd %%xmm11, (%1)" "\n" 2288 : /*OUT*/ 2289 : /*IN*/ 
"r"(src), "r"(dst) 2290 : /*TRASH*/ "xmm11","xmm2" 2291 ); 2292 } 2293 } 2294 2295 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2296 { 2297 if (mem) { 2298 __asm__ __volatile__( 2299 "movupd (%1), %%xmm11" "\n\t" 2300 "roundsd $4, (%0), %%xmm11" "\n\t" 2301 "movupd %%xmm11, (%1)" "\n" 2302 : /*OUT*/ 2303 : /*IN*/ "r"(src), "r"(dst) 2304 : /*TRASH*/ "xmm11" 2305 ); 2306 } else { 2307 __asm__ __volatile__( 2308 "movupd (%1), %%xmm11" "\n\t" 2309 "movupd (%0), %%xmm2" "\n\t" 2310 "roundsd $4, %%xmm2, %%xmm11" "\n\t" 2311 "movupd %%xmm11, (%1)" "\n" 2312 : /*OUT*/ 2313 : /*IN*/ "r"(src), "r"(dst) 2314 : /*TRASH*/ "xmm11","xmm2" 2315 ); 2316 } 2317 } 2318 2319 void test_ROUNDSD_w_immediate_rounding ( void ) 2320 { 2321 double vals[22]; 2322 Int i = 0; 2323 vals[i++] = 0.0; 2324 vals[i++] = -0.0; 2325 vals[i++] = mkPosInf(); 2326 vals[i++] = mkNegInf(); 2327 vals[i++] = mkPosNan(); 2328 vals[i++] = mkNegNan(); 2329 vals[i++] = -1.3; 2330 vals[i++] = -1.1; 2331 vals[i++] = -0.9; 2332 vals[i++] = -0.7; 2333 vals[i++] = -0.50001; 2334 vals[i++] = -0.49999; 2335 vals[i++] = -0.3; 2336 vals[i++] = -0.1; 2337 vals[i++] = 0.1; 2338 vals[i++] = 0.3; 2339 vals[i++] = 0.49999; 2340 vals[i++] = 0.50001; 2341 vals[i++] = 0.7; 2342 vals[i++] = 0.9; 2343 vals[i++] = 1.1; 2344 vals[i++] = 1.3; 2345 assert(i == 22); 2346 2347 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2348 V128 src, dst; 2349 2350 randV128(&src); 2351 randV128(&dst); 2352 memcpy(&src[0], &vals[i], 8); 2353 do_ROUNDSD_000(False/*reg*/, &src, &dst); 2354 printf("r roundsd_000 "); 2355 showV128(&src); 2356 printf(" "); 2357 showV128(&dst); 2358 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2359 printf("\n"); 2360 2361 randV128(&src); 2362 randV128(&dst); 2363 memcpy(&src[0], &vals[i], 8); 2364 do_ROUNDSD_000(True/*mem*/, &src, &dst); 2365 printf("m roundsd_000 "); 2366 showV128(&src); 2367 printf(" "); 2368 showV128(&dst); 2369 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2370 
printf("\n"); 2371 2372 2373 randV128(&src); 2374 randV128(&dst); 2375 memcpy(&src[0], &vals[i], 8); 2376 do_ROUNDSD_001(False/*reg*/, &src, &dst); 2377 printf("r roundsd_001 "); 2378 showV128(&src); 2379 printf(" "); 2380 showV128(&dst); 2381 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2382 printf("\n"); 2383 2384 randV128(&src); 2385 randV128(&dst); 2386 memcpy(&src[0], &vals[i], 8); 2387 do_ROUNDSD_001(True/*mem*/, &src, &dst); 2388 printf("m roundsd_001 "); 2389 showV128(&src); 2390 printf(" "); 2391 showV128(&dst); 2392 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2393 printf("\n"); 2394 2395 2396 randV128(&src); 2397 randV128(&dst); 2398 memcpy(&src[0], &vals[i], 8); 2399 do_ROUNDSD_010(False/*reg*/, &src, &dst); 2400 printf("r roundsd_010 "); 2401 showV128(&src); 2402 printf(" "); 2403 showV128(&dst); 2404 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2405 printf("\n"); 2406 2407 randV128(&src); 2408 randV128(&dst); 2409 memcpy(&src[0], &vals[i], 8); 2410 do_ROUNDSD_010(True/*mem*/, &src, &dst); 2411 printf("m roundsd_010 "); 2412 showV128(&src); 2413 printf(" "); 2414 showV128(&dst); 2415 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2416 printf("\n"); 2417 2418 2419 randV128(&src); 2420 randV128(&dst); 2421 memcpy(&src[0], &vals[i], 8); 2422 do_ROUNDSD_011(False/*reg*/, &src, &dst); 2423 printf("r roundsd_011 "); 2424 showV128(&src); 2425 printf(" "); 2426 showV128(&dst); 2427 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2428 printf("\n"); 2429 2430 randV128(&src); 2431 randV128(&dst); 2432 memcpy(&src[0], &vals[i], 8); 2433 do_ROUNDSD_011(True/*mem*/, &src, &dst); 2434 printf("m roundsd_011 "); 2435 showV128(&src); 2436 printf(" "); 2437 showV128(&dst); 2438 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2439 printf("\n"); 2440 } 2441 } 2442 2443 void test_ROUNDSD_w_mxcsr_rounding ( void ) 2444 { 2445 UInt rm; 2446 double vals[22]; 2447 Int i = 0; 2448 vals[i++] = 0.0; 2449 vals[i++] = -0.0; 2450 vals[i++] = 
mkPosInf(); 2451 vals[i++] = mkNegInf(); 2452 vals[i++] = mkPosNan(); 2453 vals[i++] = mkNegNan(); 2454 vals[i++] = -1.3; 2455 vals[i++] = -1.1; 2456 vals[i++] = -0.9; 2457 vals[i++] = -0.7; 2458 vals[i++] = -0.50001; 2459 vals[i++] = -0.49999; 2460 vals[i++] = -0.3; 2461 vals[i++] = -0.1; 2462 vals[i++] = 0.1; 2463 vals[i++] = 0.3; 2464 vals[i++] = 0.49999; 2465 vals[i++] = 0.50001; 2466 vals[i++] = 0.7; 2467 vals[i++] = 0.9; 2468 vals[i++] = 1.1; 2469 vals[i++] = 1.3; 2470 assert(i == 22); 2471 2472 rm = get_sse_roundingmode(); 2473 assert(rm == 0); // 0 == RN == default 2474 2475 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2476 V128 src, dst; 2477 2478 for (rm = 0; rm <= 3; rm++) { 2479 set_sse_roundingmode(rm); 2480 2481 randV128(&src); 2482 randV128(&dst); 2483 memcpy(&src[0], &vals[i], 8); 2484 do_ROUNDSD_1XX(False/*reg*/, &src, &dst); 2485 printf("r (rm=%u) roundsd_1XX ", rm); 2486 showV128(&src); 2487 printf(" "); 2488 showV128(&dst); 2489 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2490 printf("\n"); 2491 2492 randV128(&src); 2493 randV128(&dst); 2494 memcpy(&src[0], &vals[i], 8); 2495 do_ROUNDSD_1XX(True/*mem*/, &src, &dst); 2496 printf("m (rm=%u) roundsd_1XX ", rm); 2497 showV128(&src); 2498 printf(" "); 2499 showV128(&dst); 2500 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2501 printf("\n"); 2502 } 2503 } 2504 2505 rm = get_sse_roundingmode(); 2506 assert(rm == 3); 2507 set_sse_roundingmode(0); 2508 rm = get_sse_roundingmode(); 2509 assert(rm == 0); // 0 == RN == default 2510 } 2511 2512 2513 /* ------------ ROUNDSS ------------ */ 2514 2515 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2516 { 2517 if (mem) { 2518 __asm__ __volatile__( 2519 "movupd (%1), %%xmm11" "\n\t" 2520 "roundss $0, (%0), %%xmm11" "\n\t" 2521 "movupd %%xmm11, (%1)" "\n" 2522 : /*OUT*/ 2523 : /*IN*/ "r"(src), "r"(dst) 2524 : /*TRASH*/ "xmm11" 2525 ); 2526 } else { 2527 __asm__ __volatile__( 2528 "movupd (%1), %%xmm11" "\n\t" 2529 "movupd (%0), 
%%xmm2" "\n\t" 2530 "roundss $0, %%xmm2, %%xmm11" "\n\t" 2531 "movupd %%xmm11, (%1)" "\n" 2532 : /*OUT*/ 2533 : /*IN*/ "r"(src), "r"(dst) 2534 : /*TRASH*/ "xmm11","xmm2" 2535 ); 2536 } 2537 } 2538 2539 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2540 { 2541 if (mem) { 2542 __asm__ __volatile__( 2543 "movupd (%1), %%xmm11" "\n\t" 2544 "roundss $1, (%0), %%xmm11" "\n\t" 2545 "movupd %%xmm11, (%1)" "\n" 2546 : /*OUT*/ 2547 : /*IN*/ "r"(src), "r"(dst) 2548 : /*TRASH*/ "xmm11" 2549 ); 2550 } else { 2551 __asm__ __volatile__( 2552 "movupd (%1), %%xmm11" "\n\t" 2553 "movupd (%0), %%xmm2" "\n\t" 2554 "roundss $1, %%xmm2, %%xmm11" "\n\t" 2555 "movupd %%xmm11, (%1)" "\n" 2556 : /*OUT*/ 2557 : /*IN*/ "r"(src), "r"(dst) 2558 : /*TRASH*/ "xmm11","xmm2" 2559 ); 2560 } 2561 } 2562 2563 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2564 { 2565 if (mem) { 2566 __asm__ __volatile__( 2567 "movupd (%1), %%xmm11" "\n\t" 2568 "roundss $2, (%0), %%xmm11" "\n\t" 2569 "movupd %%xmm11, (%1)" "\n" 2570 : /*OUT*/ 2571 : /*IN*/ "r"(src), "r"(dst) 2572 : /*TRASH*/ "xmm11" 2573 ); 2574 } else { 2575 __asm__ __volatile__( 2576 "movupd (%1), %%xmm11" "\n\t" 2577 "movupd (%0), %%xmm2" "\n\t" 2578 "roundss $2, %%xmm2, %%xmm11" "\n\t" 2579 "movupd %%xmm11, (%1)" "\n" 2580 : /*OUT*/ 2581 : /*IN*/ "r"(src), "r"(dst) 2582 : /*TRASH*/ "xmm11","xmm2" 2583 ); 2584 } 2585 } 2586 2587 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2588 { 2589 if (mem) { 2590 __asm__ __volatile__( 2591 "movupd (%1), %%xmm11" "\n\t" 2592 "roundss $3, (%0), %%xmm11" "\n\t" 2593 "movupd %%xmm11, (%1)" "\n" 2594 : /*OUT*/ 2595 : /*IN*/ "r"(src), "r"(dst) 2596 : /*TRASH*/ "xmm11" 2597 ); 2598 } else { 2599 __asm__ __volatile__( 2600 "movupd (%1), %%xmm11" "\n\t" 2601 "movupd (%0), %%xmm2" "\n\t" 2602 "roundss $3, %%xmm2, %%xmm11" "\n\t" 2603 "movupd %%xmm11, (%1)" "\n" 2604 : /*OUT*/ 2605 : /*IN*/ "r"(src), "r"(dst) 2606 : /*TRASH*/ "xmm11","xmm2" 2607 ); 2608 } 2609 } 2610 2611 
void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2612 { 2613 if (mem) { 2614 __asm__ __volatile__( 2615 "movupd (%1), %%xmm11" "\n\t" 2616 "roundss $4, (%0), %%xmm11" "\n\t" 2617 "movupd %%xmm11, (%1)" "\n" 2618 : /*OUT*/ 2619 : /*IN*/ "r"(src), "r"(dst) 2620 : /*TRASH*/ "xmm11" 2621 ); 2622 } else { 2623 __asm__ __volatile__( 2624 "movupd (%1), %%xmm11" "\n\t" 2625 "movupd (%0), %%xmm2" "\n\t" 2626 "roundss $4, %%xmm2, %%xmm11" "\n\t" 2627 "movupd %%xmm11, (%1)" "\n" 2628 : /*OUT*/ 2629 : /*IN*/ "r"(src), "r"(dst) 2630 : /*TRASH*/ "xmm11","xmm2" 2631 ); 2632 } 2633 } 2634 2635 void test_ROUNDSS_w_immediate_rounding ( void ) 2636 { 2637 float vals[22]; 2638 Int i = 0; 2639 vals[i++] = 0.0; 2640 vals[i++] = -0.0; 2641 vals[i++] = mkPosInf(); 2642 vals[i++] = mkNegInf(); 2643 vals[i++] = mkPosNan(); 2644 vals[i++] = mkNegNan(); 2645 vals[i++] = -1.3; 2646 vals[i++] = -1.1; 2647 vals[i++] = -0.9; 2648 vals[i++] = -0.7; 2649 vals[i++] = -0.50001; 2650 vals[i++] = -0.49999; 2651 vals[i++] = -0.3; 2652 vals[i++] = -0.1; 2653 vals[i++] = 0.1; 2654 vals[i++] = 0.3; 2655 vals[i++] = 0.49999; 2656 vals[i++] = 0.50001; 2657 vals[i++] = 0.7; 2658 vals[i++] = 0.9; 2659 vals[i++] = 1.1; 2660 vals[i++] = 1.3; 2661 assert(i == 22); 2662 2663 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2664 V128 src, dst; 2665 2666 randV128(&src); 2667 randV128(&dst); 2668 memcpy(&src[0], &vals[i], 4); 2669 do_ROUNDSS_000(False/*reg*/, &src, &dst); 2670 printf("r roundss_000 "); 2671 showV128(&src); 2672 printf(" "); 2673 showV128(&dst); 2674 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2675 printf("\n"); 2676 2677 randV128(&src); 2678 randV128(&dst); 2679 memcpy(&src[0], &vals[i], 4); 2680 do_ROUNDSS_000(True/*mem*/, &src, &dst); 2681 printf("m roundss_000 "); 2682 showV128(&src); 2683 printf(" "); 2684 showV128(&dst); 2685 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2686 printf("\n"); 2687 2688 2689 randV128(&src); 2690 
randV128(&dst); 2691 memcpy(&src[0], &vals[i], 4); 2692 do_ROUNDSS_001(False/*reg*/, &src, &dst); 2693 printf("r roundss_001 "); 2694 showV128(&src); 2695 printf(" "); 2696 showV128(&dst); 2697 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2698 printf("\n"); 2699 2700 randV128(&src); 2701 randV128(&dst); 2702 memcpy(&src[0], &vals[i], 4); 2703 do_ROUNDSS_001(True/*mem*/, &src, &dst); 2704 printf("m roundss_001 "); 2705 showV128(&src); 2706 printf(" "); 2707 showV128(&dst); 2708 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2709 printf("\n"); 2710 2711 2712 randV128(&src); 2713 randV128(&dst); 2714 memcpy(&src[0], &vals[i], 4); 2715 do_ROUNDSS_010(False/*reg*/, &src, &dst); 2716 printf("r roundss_010 "); 2717 showV128(&src); 2718 printf(" "); 2719 showV128(&dst); 2720 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2721 printf("\n"); 2722 2723 randV128(&src); 2724 randV128(&dst); 2725 memcpy(&src[0], &vals[i], 4); 2726 do_ROUNDSS_010(True/*mem*/, &src, &dst); 2727 printf("m roundss_010 "); 2728 showV128(&src); 2729 printf(" "); 2730 showV128(&dst); 2731 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2732 printf("\n"); 2733 2734 2735 randV128(&src); 2736 randV128(&dst); 2737 memcpy(&src[0], &vals[i], 4); 2738 do_ROUNDSS_011(False/*reg*/, &src, &dst); 2739 printf("r roundss_011 "); 2740 showV128(&src); 2741 printf(" "); 2742 showV128(&dst); 2743 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2744 printf("\n"); 2745 2746 randV128(&src); 2747 randV128(&dst); 2748 memcpy(&src[0], &vals[i], 4); 2749 do_ROUNDSS_011(True/*mem*/, &src, &dst); 2750 printf("m roundss_011 "); 2751 showV128(&src); 2752 printf(" "); 2753 showV128(&dst); 2754 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2755 printf("\n"); 2756 } 2757 } 2758 2759 void test_ROUNDSS_w_mxcsr_rounding ( void ) 2760 { 2761 UInt rm; 2762 float vals[22]; 2763 Int i = 0; 2764 vals[i++] = 0.0; 2765 
vals[i++] = -0.0; 2766 vals[i++] = mkPosInf(); 2767 vals[i++] = mkNegInf(); 2768 vals[i++] = mkPosNan(); 2769 vals[i++] = mkNegNan(); 2770 vals[i++] = -1.3; 2771 vals[i++] = -1.1; 2772 vals[i++] = -0.9; 2773 vals[i++] = -0.7; 2774 vals[i++] = -0.50001; 2775 vals[i++] = -0.49999; 2776 vals[i++] = -0.3; 2777 vals[i++] = -0.1; 2778 vals[i++] = 0.1; 2779 vals[i++] = 0.3; 2780 vals[i++] = 0.49999; 2781 vals[i++] = 0.50001; 2782 vals[i++] = 0.7; 2783 vals[i++] = 0.9; 2784 vals[i++] = 1.1; 2785 vals[i++] = 1.3; 2786 assert(i == 22); 2787 2788 rm = get_sse_roundingmode(); 2789 assert(rm == 0); // 0 == RN == default 2790 2791 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2792 V128 src, dst; 2793 2794 for (rm = 0; rm <= 3; rm++) { 2795 set_sse_roundingmode(rm); 2796 2797 randV128(&src); 2798 randV128(&dst); 2799 memcpy(&src[0], &vals[i], 4); 2800 do_ROUNDSS_1XX(False/*reg*/, &src, &dst); 2801 printf("r (rm=%u) roundss_1XX ", rm); 2802 showV128(&src); 2803 printf(" "); 2804 showV128(&dst); 2805 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2806 printf("\n"); 2807 2808 randV128(&src); 2809 randV128(&dst); 2810 memcpy(&src[0], &vals[i], 4); 2811 do_ROUNDSS_1XX(True/*mem*/, &src, &dst); 2812 printf("m (rm=%u) roundss_1XX ", rm); 2813 showV128(&src); 2814 printf(" "); 2815 showV128(&dst); 2816 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2817 printf("\n"); 2818 } 2819 } 2820 2821 rm = get_sse_roundingmode(); 2822 assert(rm == 3); 2823 set_sse_roundingmode(0); 2824 rm = get_sse_roundingmode(); 2825 assert(rm == 0); // 0 == RN == default 2826 } 2827 2828 /* ------------ ROUNDPD ------------ */ 2829 2830 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2831 { 2832 if (mem) { 2833 __asm__ __volatile__( 2834 "movupd (%1), %%xmm11" "\n\t" 2835 "roundpd $0, (%0), %%xmm11" "\n\t" 2836 "movupd %%xmm11, (%1)" "\n" 2837 : /*OUT*/ 2838 : /*IN*/ "r"(src), "r"(dst) 2839 : /*TRASH*/ "xmm11" 2840 ); 2841 } else { 2842 __asm__ 
__volatile__( 2843 "movupd (%1), %%xmm11" "\n\t" 2844 "movupd (%0), %%xmm2" "\n\t" 2845 "roundpd $0, %%xmm2, %%xmm11" "\n\t" 2846 "movupd %%xmm11, (%1)" "\n" 2847 : /*OUT*/ 2848 : /*IN*/ "r"(src), "r"(dst) 2849 : /*TRASH*/ "xmm11","xmm2" 2850 ); 2851 } 2852 } 2853 2854 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2855 { 2856 if (mem) { 2857 __asm__ __volatile__( 2858 "movupd (%1), %%xmm11" "\n\t" 2859 "roundpd $1, (%0), %%xmm11" "\n\t" 2860 "movupd %%xmm11, (%1)" "\n" 2861 : /*OUT*/ 2862 : /*IN*/ "r"(src), "r"(dst) 2863 : /*TRASH*/ "xmm11" 2864 ); 2865 } else { 2866 __asm__ __volatile__( 2867 "movupd (%1), %%xmm11" "\n\t" 2868 "movupd (%0), %%xmm2" "\n\t" 2869 "roundpd $1, %%xmm2, %%xmm11" "\n\t" 2870 "movupd %%xmm11, (%1)" "\n" 2871 : /*OUT*/ 2872 : /*IN*/ "r"(src), "r"(dst) 2873 : /*TRASH*/ "xmm11","xmm2" 2874 ); 2875 } 2876 } 2877 2878 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2879 { 2880 if (mem) { 2881 __asm__ __volatile__( 2882 "movupd (%1), %%xmm11" "\n\t" 2883 "roundpd $2, (%0), %%xmm11" "\n\t" 2884 "movupd %%xmm11, (%1)" "\n" 2885 : /*OUT*/ 2886 : /*IN*/ "r"(src), "r"(dst) 2887 : /*TRASH*/ "xmm11" 2888 ); 2889 } else { 2890 __asm__ __volatile__( 2891 "movupd (%1), %%xmm11" "\n\t" 2892 "movupd (%0), %%xmm2" "\n\t" 2893 "roundpd $2, %%xmm2, %%xmm11" "\n\t" 2894 "movupd %%xmm11, (%1)" "\n" 2895 : /*OUT*/ 2896 : /*IN*/ "r"(src), "r"(dst) 2897 : /*TRASH*/ "xmm11","xmm2" 2898 ); 2899 } 2900 } 2901 2902 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2903 { 2904 if (mem) { 2905 __asm__ __volatile__( 2906 "movupd (%1), %%xmm11" "\n\t" 2907 "roundpd $3, (%0), %%xmm11" "\n\t" 2908 "movupd %%xmm11, (%1)" "\n" 2909 : /*OUT*/ 2910 : /*IN*/ "r"(src), "r"(dst) 2911 : /*TRASH*/ "xmm11" 2912 ); 2913 } else { 2914 __asm__ __volatile__( 2915 "movupd (%1), %%xmm11" "\n\t" 2916 "movupd (%0), %%xmm2" "\n\t" 2917 "roundpd $3, %%xmm2, %%xmm11" "\n\t" 2918 "movupd %%xmm11, (%1)" "\n" 2919 : /*OUT*/ 2920 : /*IN*/ "r"(src), "r"(dst) 
2921 : /*TRASH*/ "xmm11","xmm2" 2922 ); 2923 } 2924 } 2925 2926 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2927 { 2928 if (mem) { 2929 __asm__ __volatile__( 2930 "movupd (%1), %%xmm11" "\n\t" 2931 "roundpd $4, (%0), %%xmm11" "\n\t" 2932 "movupd %%xmm11, (%1)" "\n" 2933 : /*OUT*/ 2934 : /*IN*/ "r"(src), "r"(dst) 2935 : /*TRASH*/ "xmm11" 2936 ); 2937 } else { 2938 __asm__ __volatile__( 2939 "movupd (%1), %%xmm11" "\n\t" 2940 "movupd (%0), %%xmm2" "\n\t" 2941 "roundpd $4, %%xmm2, %%xmm11" "\n\t" 2942 "movupd %%xmm11, (%1)" "\n" 2943 : /*OUT*/ 2944 : /*IN*/ "r"(src), "r"(dst) 2945 : /*TRASH*/ "xmm11","xmm2" 2946 ); 2947 } 2948 } 2949 2950 void test_ROUNDPD_w_immediate_rounding ( void ) 2951 { 2952 double vals[22]; 2953 Int i = 0; 2954 vals[i++] = 0.0; 2955 vals[i++] = -0.0; 2956 vals[i++] = mkPosInf(); 2957 vals[i++] = mkNegInf(); 2958 vals[i++] = mkPosNan(); 2959 vals[i++] = mkNegNan(); 2960 vals[i++] = -1.3; 2961 vals[i++] = -1.1; 2962 vals[i++] = -0.9; 2963 vals[i++] = -0.7; 2964 vals[i++] = -0.50001; 2965 vals[i++] = -0.49999; 2966 vals[i++] = -0.3; 2967 vals[i++] = -0.1; 2968 vals[i++] = 0.1; 2969 vals[i++] = 0.3; 2970 vals[i++] = 0.49999; 2971 vals[i++] = 0.50001; 2972 vals[i++] = 0.7; 2973 vals[i++] = 0.9; 2974 vals[i++] = 1.1; 2975 vals[i++] = 1.3; 2976 assert(i == 22); 2977 2978 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2979 V128 src, dst; 2980 2981 randV128(&src); 2982 randV128(&dst); 2983 memcpy(&src[0], &vals[i], 8); 2984 memcpy(&src[8], &vals[(i+11)%22], 8); 2985 do_ROUNDPD_000(False/*reg*/, &src, &dst); 2986 printf("r roundpd_000 "); 2987 showV128(&src); 2988 printf(" "); 2989 showV128(&dst); 2990 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 2991 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 2992 printf("\n"); 2993 2994 randV128(&src); 2995 randV128(&dst); 2996 memcpy(&src[0], &vals[i], 8); 2997 memcpy(&src[8], &vals[(i+11)%22], 8); 2998 do_ROUNDPD_000(True/*mem*/, &src, &dst); 2999 printf("m 
roundpd_000 "); 3000 showV128(&src); 3001 printf(" "); 3002 showV128(&dst); 3003 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3004 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3005 printf("\n"); 3006 3007 3008 randV128(&src); 3009 randV128(&dst); 3010 memcpy(&src[0], &vals[i], 8); 3011 memcpy(&src[8], &vals[(i+11)%22], 8); 3012 do_ROUNDPD_001(False/*reg*/, &src, &dst); 3013 printf("r roundpd_001 "); 3014 showV128(&src); 3015 printf(" "); 3016 showV128(&dst); 3017 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3018 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3019 printf("\n"); 3020 3021 randV128(&src); 3022 randV128(&dst); 3023 memcpy(&src[0], &vals[i], 8); 3024 memcpy(&src[8], &vals[(i+11)%22], 8); 3025 do_ROUNDPD_001(True/*mem*/, &src, &dst); 3026 printf("m roundpd_001 "); 3027 showV128(&src); 3028 printf(" "); 3029 showV128(&dst); 3030 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3031 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3032 printf("\n"); 3033 3034 3035 randV128(&src); 3036 randV128(&dst); 3037 memcpy(&src[0], &vals[i], 8); 3038 memcpy(&src[8], &vals[(i+11)%22], 8); 3039 do_ROUNDPD_010(False/*reg*/, &src, &dst); 3040 printf("r roundpd_010 "); 3041 showV128(&src); 3042 printf(" "); 3043 showV128(&dst); 3044 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3045 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3046 printf("\n"); 3047 3048 randV128(&src); 3049 randV128(&dst); 3050 memcpy(&src[0], &vals[i], 8); 3051 memcpy(&src[8], &vals[(i+11)%22], 8); 3052 do_ROUNDPD_010(True/*mem*/, &src, &dst); 3053 printf("m roundpd_010 "); 3054 showV128(&src); 3055 printf(" "); 3056 showV128(&dst); 3057 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3058 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3059 printf("\n"); 3060 3061 3062 randV128(&src); 3063 randV128(&dst); 3064 memcpy(&src[0], &vals[i], 8); 3065 memcpy(&src[8], &vals[(i+11)%22], 
8); 3066 do_ROUNDPD_011(False/*reg*/, &src, &dst); 3067 printf("r roundpd_011 "); 3068 showV128(&src); 3069 printf(" "); 3070 showV128(&dst); 3071 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3072 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3073 printf("\n"); 3074 3075 randV128(&src); 3076 randV128(&dst); 3077 memcpy(&src[0], &vals[i], 8); 3078 memcpy(&src[8], &vals[(i+11)%22], 8); 3079 do_ROUNDPD_011(True/*mem*/, &src, &dst); 3080 printf("m roundpd_011 "); 3081 showV128(&src); 3082 printf(" "); 3083 showV128(&dst); 3084 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3085 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3086 printf("\n"); 3087 } 3088 } 3089 3090 void test_ROUNDPD_w_mxcsr_rounding ( void ) 3091 { 3092 UInt rm; 3093 double vals[22]; 3094 Int i = 0; 3095 vals[i++] = 0.0; 3096 vals[i++] = -0.0; 3097 vals[i++] = mkPosInf(); 3098 vals[i++] = mkNegInf(); 3099 vals[i++] = mkPosNan(); 3100 vals[i++] = mkNegNan(); 3101 vals[i++] = -1.3; 3102 vals[i++] = -1.1; 3103 vals[i++] = -0.9; 3104 vals[i++] = -0.7; 3105 vals[i++] = -0.50001; 3106 vals[i++] = -0.49999; 3107 vals[i++] = -0.3; 3108 vals[i++] = -0.1; 3109 vals[i++] = 0.1; 3110 vals[i++] = 0.3; 3111 vals[i++] = 0.49999; 3112 vals[i++] = 0.50001; 3113 vals[i++] = 0.7; 3114 vals[i++] = 0.9; 3115 vals[i++] = 1.1; 3116 vals[i++] = 1.3; 3117 assert(i == 22); 3118 3119 rm = get_sse_roundingmode(); 3120 assert(rm == 0); // 0 == RN == default 3121 3122 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3123 V128 src, dst; 3124 3125 for (rm = 0; rm <= 3; rm++) { 3126 set_sse_roundingmode(rm); 3127 3128 randV128(&src); 3129 randV128(&dst); 3130 memcpy(&src[0], &vals[i], 8); 3131 memcpy(&src[8], &vals[(i+11)%22], 8); 3132 do_ROUNDPD_1XX(False/*reg*/, &src, &dst); 3133 printf("r (rm=%u) roundpd_1XX ", rm); 3134 showV128(&src); 3135 printf(" "); 3136 showV128(&dst); 3137 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3138 printf(" %10f -> %10f", 
vals[(i+11)%22], *(double*)(&dst[8])); 3139 printf("\n"); 3140 3141 randV128(&src); 3142 randV128(&dst); 3143 memcpy(&src[0], &vals[i], 8); 3144 memcpy(&src[8], &vals[(i+11)%22], 8); 3145 do_ROUNDPD_1XX(True/*mem*/, &src, &dst); 3146 printf("m (rm=%u) roundpd_1XX ", rm); 3147 showV128(&src); 3148 printf(" "); 3149 showV128(&dst); 3150 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3151 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3152 printf("\n"); 3153 } 3154 } 3155 3156 rm = get_sse_roundingmode(); 3157 assert(rm == 3); 3158 set_sse_roundingmode(0); 3159 rm = get_sse_roundingmode(); 3160 assert(rm == 0); // 0 == RN == default 3161 } 3162 3163 /* ------------ ROUNDPS ------------ */ 3164 3165 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3166 { 3167 if (mem) { 3168 __asm__ __volatile__( 3169 "movupd (%1), %%xmm11" "\n\t" 3170 "roundps $0, (%0), %%xmm11" "\n\t" 3171 "movupd %%xmm11, (%1)" "\n" 3172 : /*OUT*/ 3173 : /*IN*/ "r"(src), "r"(dst) 3174 : /*TRASH*/ "xmm11" 3175 ); 3176 } else { 3177 __asm__ __volatile__( 3178 "movupd (%1), %%xmm11" "\n\t" 3179 "movupd (%0), %%xmm2" "\n\t" 3180 "roundps $0, %%xmm2, %%xmm11" "\n\t" 3181 "movupd %%xmm11, (%1)" "\n" 3182 : /*OUT*/ 3183 : /*IN*/ "r"(src), "r"(dst) 3184 : /*TRASH*/ "xmm11","xmm2" 3185 ); 3186 } 3187 } 3188 3189 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3190 { 3191 if (mem) { 3192 __asm__ __volatile__( 3193 "movupd (%1), %%xmm11" "\n\t" 3194 "roundps $1, (%0), %%xmm11" "\n\t" 3195 "movupd %%xmm11, (%1)" "\n" 3196 : /*OUT*/ 3197 : /*IN*/ "r"(src), "r"(dst) 3198 : /*TRASH*/ "xmm11" 3199 ); 3200 } else { 3201 __asm__ __volatile__( 3202 "movupd (%1), %%xmm11" "\n\t" 3203 "movupd (%0), %%xmm2" "\n\t" 3204 "roundps $1, %%xmm2, %%xmm11" "\n\t" 3205 "movupd %%xmm11, (%1)" "\n" 3206 : /*OUT*/ 3207 : /*IN*/ "r"(src), "r"(dst) 3208 : /*TRASH*/ "xmm11","xmm2" 3209 ); 3210 } 3211 } 3212 3213 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3214 { 3215 
if (mem) { 3216 __asm__ __volatile__( 3217 "movupd (%1), %%xmm11" "\n\t" 3218 "roundps $2, (%0), %%xmm11" "\n\t" 3219 "movupd %%xmm11, (%1)" "\n" 3220 : /*OUT*/ 3221 : /*IN*/ "r"(src), "r"(dst) 3222 : /*TRASH*/ "xmm11" 3223 ); 3224 } else { 3225 __asm__ __volatile__( 3226 "movupd (%1), %%xmm11" "\n\t" 3227 "movupd (%0), %%xmm2" "\n\t" 3228 "roundps $2, %%xmm2, %%xmm11" "\n\t" 3229 "movupd %%xmm11, (%1)" "\n" 3230 : /*OUT*/ 3231 : /*IN*/ "r"(src), "r"(dst) 3232 : /*TRASH*/ "xmm11","xmm2" 3233 ); 3234 } 3235 } 3236 3237 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3238 { 3239 if (mem) { 3240 __asm__ __volatile__( 3241 "movupd (%1), %%xmm11" "\n\t" 3242 "roundps $3, (%0), %%xmm11" "\n\t" 3243 "movupd %%xmm11, (%1)" "\n" 3244 : /*OUT*/ 3245 : /*IN*/ "r"(src), "r"(dst) 3246 : /*TRASH*/ "xmm11" 3247 ); 3248 } else { 3249 __asm__ __volatile__( 3250 "movupd (%1), %%xmm11" "\n\t" 3251 "movupd (%0), %%xmm2" "\n\t" 3252 "roundps $3, %%xmm2, %%xmm11" "\n\t" 3253 "movupd %%xmm11, (%1)" "\n" 3254 : /*OUT*/ 3255 : /*IN*/ "r"(src), "r"(dst) 3256 : /*TRASH*/ "xmm11","xmm2" 3257 ); 3258 } 3259 } 3260 3261 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 3262 { 3263 if (mem) { 3264 __asm__ __volatile__( 3265 "movupd (%1), %%xmm11" "\n\t" 3266 "roundps $4, (%0), %%xmm11" "\n\t" 3267 "movupd %%xmm11, (%1)" "\n" 3268 : /*OUT*/ 3269 : /*IN*/ "r"(src), "r"(dst) 3270 : /*TRASH*/ "xmm11" 3271 ); 3272 } else { 3273 __asm__ __volatile__( 3274 "movupd (%1), %%xmm11" "\n\t" 3275 "movupd (%0), %%xmm2" "\n\t" 3276 "roundps $4, %%xmm2, %%xmm11" "\n\t" 3277 "movupd %%xmm11, (%1)" "\n" 3278 : /*OUT*/ 3279 : /*IN*/ "r"(src), "r"(dst) 3280 : /*TRASH*/ "xmm11","xmm2" 3281 ); 3282 } 3283 } 3284 3285 void test_ROUNDPS_w_immediate_rounding ( void ) 3286 { 3287 float vals[22]; 3288 Int i = 0; 3289 vals[i++] = 0.0; 3290 vals[i++] = -0.0; 3291 vals[i++] = mkPosInf(); 3292 vals[i++] = mkNegInf(); 3293 vals[i++] = mkPosNan(); 3294 vals[i++] = mkNegNan(); 3295 vals[i++] = -1.3; 
3296 vals[i++] = -1.1; 3297 vals[i++] = -0.9; 3298 vals[i++] = -0.7; 3299 vals[i++] = -0.50001; 3300 vals[i++] = -0.49999; 3301 vals[i++] = -0.3; 3302 vals[i++] = -0.1; 3303 vals[i++] = 0.1; 3304 vals[i++] = 0.3; 3305 vals[i++] = 0.49999; 3306 vals[i++] = 0.50001; 3307 vals[i++] = 0.7; 3308 vals[i++] = 0.9; 3309 vals[i++] = 1.1; 3310 vals[i++] = 1.3; 3311 assert(i == 22); 3312 3313 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3314 V128 src, dst; 3315 3316 randV128(&src); 3317 randV128(&dst); 3318 memcpy(&src[0], &vals[i], 4); 3319 memcpy(&src[4], &vals[(i+5)%22], 4); 3320 memcpy(&src[8], &vals[(i+11)%22], 4); 3321 memcpy(&src[12], &vals[(i+17)%22], 4); 3322 do_ROUNDPS_000(False/*reg*/, &src, &dst); 3323 printf("r roundps_000 "); 3324 showV128(&src); 3325 printf(" "); 3326 showV128(&dst); 3327 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3328 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3329 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3330 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3331 printf("\n"); 3332 3333 randV128(&src); 3334 randV128(&dst); 3335 memcpy(&src[0], &vals[i], 4); 3336 memcpy(&src[4], &vals[(i+5)%22], 4); 3337 memcpy(&src[8], &vals[(i+11)%22], 4); 3338 memcpy(&src[12], &vals[(i+17)%22], 4); 3339 do_ROUNDPS_000(True/*mem*/, &src, &dst); 3340 printf("m roundps_000 "); 3341 showV128(&src); 3342 printf(" "); 3343 showV128(&dst); 3344 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3345 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3346 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3347 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3348 printf("\n"); 3349 3350 3351 randV128(&src); 3352 randV128(&dst); 3353 memcpy(&src[0], &vals[i], 4); 3354 memcpy(&src[4], &vals[(i+5)%22], 4); 3355 memcpy(&src[8], &vals[(i+11)%22], 4); 3356 memcpy(&src[12], &vals[(i+17)%22], 4); 3357 do_ROUNDPS_001(False/*reg*/, &src, &dst); 
3358 printf("r roundps_001 "); 3359 showV128(&src); 3360 printf(" "); 3361 showV128(&dst); 3362 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3363 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3364 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3365 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3366 printf("\n"); 3367 3368 randV128(&src); 3369 randV128(&dst); 3370 memcpy(&src[0], &vals[i], 4); 3371 memcpy(&src[4], &vals[(i+5)%22], 4); 3372 memcpy(&src[8], &vals[(i+11)%22], 4); 3373 memcpy(&src[12], &vals[(i+17)%22], 4); 3374 do_ROUNDPS_001(True/*mem*/, &src, &dst); 3375 printf("m roundps_001 "); 3376 showV128(&src); 3377 printf(" "); 3378 showV128(&dst); 3379 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3380 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3381 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3382 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3383 printf("\n"); 3384 3385 3386 randV128(&src); 3387 randV128(&dst); 3388 memcpy(&src[0], &vals[i], 4); 3389 memcpy(&src[4], &vals[(i+5)%22], 4); 3390 memcpy(&src[8], &vals[(i+11)%22], 4); 3391 memcpy(&src[12], &vals[(i+17)%22], 4); 3392 do_ROUNDPS_010(False/*reg*/, &src, &dst); 3393 printf("r roundps_010 "); 3394 showV128(&src); 3395 printf(" "); 3396 showV128(&dst); 3397 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3398 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3399 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3400 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3401 printf("\n"); 3402 3403 randV128(&src); 3404 randV128(&dst); 3405 memcpy(&src[0], &vals[i], 4); 3406 memcpy(&src[4], &vals[(i+5)%22], 4); 3407 memcpy(&src[8], &vals[(i+11)%22], 4); 3408 memcpy(&src[12], &vals[(i+17)%22], 4); 3409 do_ROUNDPS_010(True/*mem*/, &src, &dst); 3410 printf("m roundps_010 "); 3411 showV128(&src); 3412 printf(" "); 3413 
showV128(&dst); 3414 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3415 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3416 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3417 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3418 printf("\n"); 3419 3420 3421 randV128(&src); 3422 randV128(&dst); 3423 memcpy(&src[0], &vals[i], 4); 3424 memcpy(&src[4], &vals[(i+5)%22], 4); 3425 memcpy(&src[8], &vals[(i+11)%22], 4); 3426 memcpy(&src[12], &vals[(i+17)%22], 4); 3427 do_ROUNDPS_011(False/*reg*/, &src, &dst); 3428 printf("r roundps_011 "); 3429 showV128(&src); 3430 printf(" "); 3431 showV128(&dst); 3432 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3433 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3434 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3435 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3436 printf("\n"); 3437 3438 randV128(&src); 3439 randV128(&dst); 3440 memcpy(&src[0], &vals[i], 4); 3441 memcpy(&src[4], &vals[(i+5)%22], 4); 3442 memcpy(&src[8], &vals[(i+11)%22], 4); 3443 memcpy(&src[12], &vals[(i+17)%22], 4); 3444 do_ROUNDPS_011(True/*mem*/, &src, &dst); 3445 printf("m roundps_011 "); 3446 showV128(&src); 3447 printf(" "); 3448 showV128(&dst); 3449 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3450 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3451 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3452 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3453 printf("\n"); 3454 } 3455 } 3456 3457 void test_ROUNDPS_w_mxcsr_rounding ( void ) 3458 { 3459 UInt rm; 3460 float vals[22]; 3461 Int i = 0; 3462 vals[i++] = 0.0; 3463 vals[i++] = -0.0; 3464 vals[i++] = mkPosInf(); 3465 vals[i++] = mkNegInf(); 3466 vals[i++] = mkPosNan(); 3467 vals[i++] = mkNegNan(); 3468 vals[i++] = -1.3; 3469 vals[i++] = -1.1; 3470 vals[i++] = -0.9; 3471 vals[i++] = -0.7; 3472 vals[i++] = -0.50001; 
3473 vals[i++] = -0.49999; 3474 vals[i++] = -0.3; 3475 vals[i++] = -0.1; 3476 vals[i++] = 0.1; 3477 vals[i++] = 0.3; 3478 vals[i++] = 0.49999; 3479 vals[i++] = 0.50001; 3480 vals[i++] = 0.7; 3481 vals[i++] = 0.9; 3482 vals[i++] = 1.1; 3483 vals[i++] = 1.3; 3484 assert(i == 22); 3485 3486 rm = get_sse_roundingmode(); 3487 assert(rm == 0); // 0 == RN == default 3488 3489 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3490 V128 src, dst; 3491 3492 for (rm = 0; rm <= 3; rm++) { 3493 set_sse_roundingmode(rm); 3494 3495 randV128(&src); 3496 randV128(&dst); 3497 memcpy(&src[0], &vals[i], 4); 3498 memcpy(&src[4], &vals[(i+5)%22], 4); 3499 memcpy(&src[8], &vals[(i+11)%22], 4); 3500 memcpy(&src[12], &vals[(i+17)%22], 4); 3501 do_ROUNDPS_1XX(False/*reg*/, &src, &dst); 3502 printf("r (rm=%u) roundps_1XX ", rm); 3503 showV128(&src); 3504 printf(" "); 3505 showV128(&dst); 3506 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3507 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3508 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3509 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3510 printf("\n"); 3511 3512 randV128(&src); 3513 randV128(&dst); 3514 memcpy(&src[0], &vals[i], 4); 3515 memcpy(&src[4], &vals[(i+5)%22], 4); 3516 memcpy(&src[8], &vals[(i+11)%22], 4); 3517 memcpy(&src[12], &vals[(i+17)%22], 4); 3518 do_ROUNDPS_1XX(True/*mem*/, &src, &dst); 3519 printf("m (rm=%u) roundps_1XX ", rm); 3520 showV128(&src); 3521 printf(" "); 3522 showV128(&dst); 3523 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3524 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3525 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3526 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3527 printf("\n"); 3528 } 3529 } 3530 3531 rm = get_sse_roundingmode(); 3532 assert(rm == 3); 3533 set_sse_roundingmode(0); 3534 rm = get_sse_roundingmode(); 3535 assert(rm == 0); // 0 == RN == 
default 3536 } 3537 3538 /* ------------ PTEST ------------ */ 3539 3540 void test_PTEST ( void ) 3541 { 3542 const Int ntests = 8; 3543 V128 spec[ntests]; 3544 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL ); 3545 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL ); 3546 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL ); 3547 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL ); 3548 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL ); 3549 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL ); 3550 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL ); 3551 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL ); 3552 V128 block[2]; 3553 Int i, j; 3554 ULong flags; 3555 for (i = 0; i < ntests; i++) { 3556 for (j = 0; j < ntests; j++) { 3557 memcpy(&block[0], &spec[i], 16); 3558 memcpy(&block[1], &spec[j], 16); 3559 __asm__ __volatile__( 3560 "subq $256, %%rsp" "\n\t" 3561 "movupd 0(%1), %%xmm2" "\n\t" 3562 "ptest 16(%1), %%xmm2" "\n\t" 3563 "pushfq" "\n\t" 3564 "popq %0" "\n\t" 3565 "addq $256, %%rsp" "\n\t" 3566 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) : 3567 "xmm2", "memory", "cc" 3568 ); 3569 printf("r ptest "); 3570 showV128(&block[0]); 3571 printf(" "); 3572 showV128(&block[1]); 3573 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5); 3574 } 3575 } 3576 } 3577 3578 /* ------------ PBLENDVB ------------ */ 3579 3580 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3581 { 3582 if (mem) { 3583 __asm__ __volatile__( 3584 "movupd (%2), %%xmm0" "\n\t" 3585 "movupd (%1), %%xmm11" "\n\t" 3586 "pblendvb (%0), %%xmm11" "\n\t" 3587 "movupd %%xmm11, (%1)" "\n" 3588 : /*OUT*/ 3589 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3590 : /*TRASH*/ "xmm11","xmm0" 3591 ); 3592 } else { 3593 __asm__ __volatile__( 3594 "movupd (%2), %%xmm0" "\n\t" 3595 "movupd (%1), %%xmm11" "\n\t" 3596 "movupd (%0), %%xmm2" "\n\t" 
3597 "pblendvb %%xmm2, %%xmm11" "\n\t" 3598 "movupd %%xmm11, (%1)" "\n" 3599 : /*OUT*/ 3600 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3601 : /*TRASH*/ "xmm11","xmm2","xmm0" 3602 ); 3603 } 3604 } 3605 3606 void test_PBLENDVB ( void ) 3607 { 3608 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3609 Int i; 3610 for (i = 0; i < 10; i++) { 3611 randV128(&t_xmm0); 3612 randV128(&t_src); 3613 randV128(&t_dst); 3614 3615 memcpy(&xmm0, &t_xmm0, 16); 3616 memcpy(&src, &t_src, 16); 3617 memcpy(&dst, &t_dst, 16); 3618 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst); 3619 printf("r pblendvb "); 3620 showV128(&t_xmm0); 3621 printf(" "); 3622 showV128(&t_src); 3623 printf(" "); 3624 showV128(&t_dst); 3625 printf(" -> "); 3626 showV128(&dst); 3627 printf("\n"); 3628 3629 memcpy(&xmm0, &t_xmm0, 16); 3630 memcpy(&src, &t_src, 16); 3631 memcpy(&dst, &t_dst, 16); 3632 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst); 3633 printf("m pblendvb "); 3634 showV128(&t_xmm0); 3635 printf(" "); 3636 showV128(&t_src); 3637 printf(" "); 3638 showV128(&t_dst); 3639 printf(" -> "); 3640 showV128(&dst); 3641 printf("\n"); 3642 } 3643 } 3644 3645 /* ------------ BLENDVPD ------------ */ 3646 3647 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3648 { 3649 if (mem) { 3650 __asm__ __volatile__( 3651 "movupd (%2), %%xmm0" "\n\t" 3652 "movupd (%1), %%xmm11" "\n\t" 3653 "blendvpd (%0), %%xmm11" "\n\t" 3654 "movupd %%xmm11, (%1)" "\n" 3655 : /*OUT*/ 3656 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3657 : /*TRASH*/ "xmm11","xmm0" 3658 ); 3659 } else { 3660 __asm__ __volatile__( 3661 "movupd (%2), %%xmm0" "\n\t" 3662 "movupd (%1), %%xmm11" "\n\t" 3663 "movupd (%0), %%xmm2" "\n\t" 3664 "blendvpd %%xmm2, %%xmm11" "\n\t" 3665 "movupd %%xmm11, (%1)" "\n" 3666 : /*OUT*/ 3667 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3668 : /*TRASH*/ "xmm11","xmm2","xmm0" 3669 ); 3670 } 3671 } 3672 3673 void test_BLENDVPD ( void ) 3674 { 3675 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3676 Int i; 3677 for (i = 0; i < 10; i++) { 
3678 randV128(&t_xmm0); 3679 randV128(&t_src); 3680 randV128(&t_dst); 3681 3682 memcpy(&xmm0, &t_xmm0, 16); 3683 memcpy(&src, &t_src, 16); 3684 memcpy(&dst, &t_dst, 16); 3685 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst); 3686 printf("r blendvpd "); 3687 showV128(&t_xmm0); 3688 printf(" "); 3689 showV128(&t_src); 3690 printf(" "); 3691 showV128(&t_dst); 3692 printf(" -> "); 3693 showV128(&dst); 3694 printf("\n"); 3695 3696 memcpy(&xmm0, &t_xmm0, 16); 3697 memcpy(&src, &t_src, 16); 3698 memcpy(&dst, &t_dst, 16); 3699 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst); 3700 printf("m blendvpd "); 3701 showV128(&t_xmm0); 3702 printf(" "); 3703 showV128(&t_src); 3704 printf(" "); 3705 showV128(&t_dst); 3706 printf(" -> "); 3707 showV128(&dst); 3708 printf("\n"); 3709 } 3710 } 3711 3712 /* ------------ BLENDVPS ------------ */ 3713 3714 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3715 { 3716 if (mem) { 3717 __asm__ __volatile__( 3718 "movupd (%2), %%xmm0" "\n\t" 3719 "movupd (%1), %%xmm11" "\n\t" 3720 "blendvps (%0), %%xmm11" "\n\t" 3721 "movupd %%xmm11, (%1)" "\n" 3722 : /*OUT*/ 3723 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3724 : /*TRASH*/ "xmm11","xmm0" 3725 ); 3726 } else { 3727 __asm__ __volatile__( 3728 "movupd (%2), %%xmm0" "\n\t" 3729 "movupd (%1), %%xmm11" "\n\t" 3730 "movupd (%0), %%xmm2" "\n\t" 3731 "blendvps %%xmm2, %%xmm11" "\n\t" 3732 "movupd %%xmm11, (%1)" "\n" 3733 : /*OUT*/ 3734 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3735 : /*TRASH*/ "xmm11","xmm2","xmm0" 3736 ); 3737 } 3738 } 3739 3740 void test_BLENDVPS ( void ) 3741 { 3742 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3743 Int i; 3744 for (i = 0; i < 10; i++) { 3745 randV128(&t_xmm0); 3746 randV128(&t_src); 3747 randV128(&t_dst); 3748 3749 memcpy(&xmm0, &t_xmm0, 16); 3750 memcpy(&src, &t_src, 16); 3751 memcpy(&dst, &t_dst, 16); 3752 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst); 3753 printf("r blendvps "); 3754 showV128(&t_xmm0); 3755 printf(" "); 3756 showV128(&t_src); 3757 printf(" "); 
3758 showV128(&t_dst); 3759 printf(" -> "); 3760 showV128(&dst); 3761 printf("\n"); 3762 3763 memcpy(&xmm0, &t_xmm0, 16); 3764 memcpy(&src, &t_src, 16); 3765 memcpy(&dst, &t_dst, 16); 3766 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst); 3767 printf("m blendvps "); 3768 showV128(&t_xmm0); 3769 printf(" "); 3770 showV128(&t_src); 3771 printf(" "); 3772 showV128(&t_dst); 3773 printf(" -> "); 3774 showV128(&dst); 3775 printf("\n"); 3776 } 3777 } 3778 3779 /* ------------ main ------------ */ 3780 3781 int main ( int argc, char** argv ) 3782 { 3783 #if 1 3784 // ------ SSE 4.1 ------ 3785 test_BLENDPD(); // done Apr.01.2010 3786 test_BLENDPS(); // done Apr.02.2010 3787 test_PBLENDW(); 3788 test_PBLENDVB(); 3789 test_BLENDVPD(); 3790 test_BLENDVPS(); 3791 test_DPPD(); // done Apr.08.2010 3792 test_DPPS(); // done Apr.09.2010 3793 test_EXTRACTPS(); 3794 test_INSERTPS(); // done Apr.01.2010 3795 // MOVNTDQA *** 3796 test_PCMPEQQ(); 3797 test_PEXTRB(); // done Apr.15.2010 3798 test_PEXTRD(); // done Apr.14.2010 3799 test_PEXTRQ(); // done Apr.14.2010 3800 test_PEXTRW(); // done Apr.14.2010 3801 test_PINSRQ(); // done Apr.16.2010 3802 test_PINSRD(); // todo 3803 test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? 
*/ 3804 test_PINSRB(); // todo 3805 test_PMAXSB(); 3806 test_PMAXSD(); // done Apr.09.2010 3807 test_PMAXUD(); // done Apr.16.2010 3808 test_PMAXUW(); 3809 test_PMINSB(); 3810 test_PMINSD(); // done Apr.09.2010 3811 test_PMINUD(); 3812 test_PMINUW(); 3813 test_PMOVSXBW(); // done Apr.02.2010 3814 test_PMOVSXBD(); // done Mar.30.2010 3815 test_PMOVSXBQ(); // done Mar.30.2010 3816 test_PMOVSXWD(); // done Mar.31.2010 3817 test_PMOVSXWQ(); // done Mar.31.2010 3818 test_PMOVSXDQ(); // done Mar.31.2010 3819 test_PMOVZXBW(); // done Mar.28.2010 3820 test_PMOVZXBD(); // done Mar.29.2010 3821 test_PMOVZXBQ(); // done Mar.29.2010 3822 test_PMOVZXWD(); // done Mar.28.2010 3823 test_PMOVZXWQ(); // done Mar.29.2010 3824 test_PMOVZXDQ(); // done Mar.29.2010 3825 test_POPCNTW(); 3826 test_POPCNTL(); 3827 test_POPCNTQ(); 3828 test_PMULDQ(); 3829 test_PMULLD(); 3830 test_PTEST(); 3831 test_ROUNDSD_w_immediate_rounding(); 3832 test_ROUNDSS_w_immediate_rounding(); 3833 test_ROUNDPD_w_immediate_rounding(); 3834 test_ROUNDPS_w_immediate_rounding(); 3835 test_ROUNDSD_w_mxcsr_rounding(); 3836 test_ROUNDSS_w_mxcsr_rounding(); 3837 test_ROUNDPD_w_mxcsr_rounding(); 3838 test_ROUNDPS_w_mxcsr_rounding(); 3839 // ------ SSE 4.2 ------ 3840 test_PCMPGTQ(); 3841 // CRC32B,Q 3842 test_PACKUSDW(); 3843 test_PHMINPOSUW(); 3844 test_MPSADBW(); 3845 #else 3846 test_MPSADBW(); 3847 #endif 3848 3849 return 0; 3850 } 3851 3852