1 2 /* A program to test SSE4.1/SSE4.2 instructions. 3 Revisions: Nov.208 - wrote this file 4 Apr.10.2010 - added PEXTR* tests 5 Apr.16.2010 - added PINS* tests 6 */ 7 8 /* HOW TO COMPILE: 9 gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c 10 */ 11 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <assert.h> 15 #include "tests/malloc.h" 16 #include <string.h> 17 18 19 typedef unsigned char V128[16]; 20 typedef unsigned int UInt; 21 typedef signed int Int; 22 typedef unsigned char UChar; 23 typedef unsigned long long int ULong; 24 25 typedef unsigned char Bool; 26 #define False ((Bool)0) 27 #define True ((Bool)1) 28 29 30 typedef 31 struct { 32 V128 arg1; 33 V128 arg2; 34 V128 res; 35 } 36 RRArgs; 37 38 typedef 39 struct { 40 V128 arg1; 41 V128 res; 42 } 43 RMArgs; 44 45 static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo ) 46 { 47 // try to sidestep strict-aliasing snafus by memcpying explicitly 48 UChar* p = (UChar*)res; 49 memcpy(&p[8], (UChar*)&wHi, 8); 50 memcpy(&p[0], (UChar*)&wLo, 8); 51 } 52 53 static UChar randUChar ( void ) 54 { 55 static UInt seed = 80021; 56 seed = 1103515245 * seed + 12345; 57 return (seed >> 17) & 0xFF; 58 } 59 60 static ULong randULong ( void ) 61 { 62 Int i; 63 ULong r = 0; 64 for (i = 0; i < 8; i++) { 65 r = (r << 8) | (ULong)(0xFF & randUChar()); 66 } 67 return r; 68 } 69 70 static void randV128 ( V128* v ) 71 { 72 Int i; 73 for (i = 0; i < 16; i++) 74 (*v)[i] = randUChar(); 75 } 76 77 static void showV128 ( V128* v ) 78 { 79 Int i; 80 for (i = 15; i >= 0; i--) 81 printf("%02x", (Int)(*v)[i]); 82 } 83 84 static void showMaskedV128 ( V128* v, V128* mask ) 85 { 86 Int i; 87 for (i = 15; i >= 0; i--) 88 printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) )); 89 } 90 91 static void showIGVV( char* rOrM, char* op, Int imm, 92 ULong src64, V128* dst, V128* res ) 93 { 94 printf("%s %10s $%d ", rOrM, op, imm); 95 printf("%016llx", src64); 96 printf(" "); 97 showV128(dst); 98 printf(" "); 99 showV128(res); 100 printf("\n"); 101 } 
102 103 static void showIAG ( char* rOrM, char* op, Int imm, 104 V128* argL, ULong argR, ULong res ) 105 { 106 printf("%s %10s $%d ", rOrM, op, imm); 107 showV128(argL); 108 printf(" "); 109 printf("%016llx", argR); 110 printf(" "); 111 printf("%016llx", res); 112 printf("\n"); 113 } 114 115 static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask ) 116 { 117 printf("%s %10s $%d ", rOrM, op, imm); 118 showV128(&rra->arg1); 119 printf(" "); 120 showV128(&rra->arg2); 121 printf(" "); 122 showMaskedV128(&rra->res, rmask); 123 printf("\n"); 124 } 125 126 static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask ) 127 { 128 printf("%s %10s ", rOrM, op); 129 showV128(&rra->arg1); 130 printf(" "); 131 showV128(&rra->arg2); 132 printf(" "); 133 showMaskedV128(&rra->res, rmask); 134 printf("\n"); 135 } 136 137 /* Note: these are little endian. Hence first byte is the least 138 significant byte of lane zero. */ 139 140 /* Mask for insns where all result bits are non-approximated. */ 141 static V128 AllMask = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 142 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF }; 143 144 /* Mark for insns which produce approximated vector short results. */ 145 __attribute__((unused)) 146 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF, 147 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF }; 148 149 /* Mark for insns which produce approximated scalar short results. 
*/ 150 __attribute__((unused)) 151 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF, 152 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF }; 153 154 static V128 fives = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55, 155 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 }; 156 157 static V128 zeroes = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 158 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; 159 160 double mkPosInf ( void ) { return 1.0 / 0.0; } 161 double mkNegInf ( void ) { return -mkPosInf(); } 162 double mkPosNan ( void ) { return 0.0 / 0.0; } 163 double mkNegNan ( void ) { return -mkPosNan(); } 164 165 __attribute__((noinline)) 166 UInt get_mxcsr ( void ) 167 { 168 ULong w64; 169 __asm__ __volatile__( 170 "subq $8, %%rsp" "\n\t" 171 "stmxcsr (%%rsp)" "\n\t" 172 "movq (%%rsp), %0" "\n" 173 "addq $8, %%rsp" 174 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc" 175 ); 176 if (0) printf("get %08x\n", (UInt)w64); 177 return (UInt)w64; 178 } 179 180 __attribute__((noinline)) 181 void set_mxcsr ( UInt w32 ) 182 { 183 if (0) printf("set %08x\n", w32); 184 ULong w64 = (ULong)w32; 185 __asm__ __volatile__( 186 "subq $8, %%rsp" "\n\t" 187 "movq %0, (%%rsp)" "\n\t" 188 "ldmxcsr (%%rsp)" "\n\t" 189 "addq $8, %%rsp" 190 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc" 191 ); 192 } 193 194 UInt get_sse_roundingmode ( void ) 195 { 196 UInt w = get_mxcsr(); 197 return (w >> 13) & 3; 198 } 199 200 void set_sse_roundingmode ( UInt m ) 201 { 202 UInt w; 203 assert(0 == (m & ~3)); 204 w = get_mxcsr(); 205 w &= ~(3 << 13); 206 w |= (m << 13); 207 set_mxcsr(w); 208 } 209 210 211 #define DO_imm_r_r(_opname, _imm, _src, _dst) \ 212 { \ 213 V128 _tmp; \ 214 __asm__ __volatile__( \ 215 "movupd (%0), %%xmm2" "\n\t" \ 216 "movupd (%1), %%xmm11" "\n\t" \ 217 _opname " $" #_imm ", %%xmm2, %%xmm11" "\n\t" \ 218 "movupd %%xmm11, (%2)" "\n" \ 219 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 220 : "cc", "memory", "xmm2", "xmm11" \ 221 ); \ 222 RRArgs rra; \ 223 memcpy(&rra.arg1, 
&(_src), sizeof(V128)); \ 224 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 225 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 226 showIAA("r", (_opname), (_imm), &rra, &AllMask); \ 227 } 228 229 #define DO_imm_m_r(_opname, _imm, _src, _dst) \ 230 { \ 231 V128 _tmp; \ 232 V128* _srcM = memalign16(sizeof(V128)); \ 233 memcpy(_srcM, &(_src), sizeof(V128)); \ 234 __asm__ __volatile__( \ 235 "movupd (%1), %%xmm11" "\n\t" \ 236 _opname " $" #_imm ", (%0), %%xmm11" "\n\t" \ 237 "movupd %%xmm11, (%2)" "\n" \ 238 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 239 : "cc", "memory", "xmm11" \ 240 ); \ 241 RRArgs rra; \ 242 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 243 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 244 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 245 showIAA("m", (_opname), (_imm), &rra, &AllMask); \ 246 free(_srcM); \ 247 } 248 249 #define DO_imm_mandr_r(_opname, _imm, _src, _dst) \ 250 DO_imm_r_r( _opname, _imm, _src, _dst ) \ 251 DO_imm_m_r( _opname, _imm, _src, _dst ) 252 253 254 255 256 257 #define DO_r_r(_opname, _src, _dst) \ 258 { \ 259 V128 _tmp; \ 260 __asm__ __volatile__( \ 261 "movupd (%0), %%xmm2" "\n\t" \ 262 "movupd (%1), %%xmm11" "\n\t" \ 263 _opname " %%xmm2, %%xmm11" "\n\t" \ 264 "movupd %%xmm11, (%2)" "\n" \ 265 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \ 266 : "cc", "memory", "xmm2", "xmm11" \ 267 ); \ 268 RRArgs rra; \ 269 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 270 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 271 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 272 showAA("r", (_opname), &rra, &AllMask); \ 273 } 274 275 #define DO_m_r(_opname, _src, _dst) \ 276 { \ 277 V128 _tmp; \ 278 V128* _srcM = memalign16(sizeof(V128)); \ 279 memcpy(_srcM, &(_src), sizeof(V128)); \ 280 __asm__ __volatile__( \ 281 "movupd (%1), %%xmm11" "\n\t" \ 282 _opname " (%0), %%xmm11" "\n\t" \ 283 "movupd %%xmm11, (%2)" "\n" \ 284 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \ 285 : "cc", "memory", "xmm11" \ 286 ); \ 287 RRArgs 
rra; \ 288 memcpy(&rra.arg1, &(_src), sizeof(V128)); \ 289 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \ 290 memcpy(&rra.res, &(_tmp), sizeof(V128)); \ 291 showAA("m", (_opname), &rra, &AllMask); \ 292 free(_srcM); \ 293 } 294 295 #define DO_mandr_r(_opname, _src, _dst) \ 296 DO_r_r(_opname, _src, _dst) \ 297 DO_m_r(_opname, _src, _dst) 298 299 300 301 302 #define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix) \ 303 { \ 304 ULong _scbefore = 0x5555555555555555ULL; \ 305 ULong _scafter = 0xAAAAAAAAAAAAAAAAULL; \ 306 /* This assumes that gcc won't make any of %0, %1, %2 */ \ 307 /* be r11. That should be ensured (cough, cough) */ \ 308 /* by declaring r11 to be clobbered. */ \ 309 __asm__ __volatile__( \ 310 "movupd (%0), %%xmm2" "\n\t" \ 311 "movq (%1), %%r11" "\n\t" \ 312 _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix "\n\t" \ 313 "movq %%r11, (%2)" "\n" \ 314 : /*out*/ \ 315 : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter)) \ 316 : "cc", "memory", "xmm2", "r11" \ 317 ); \ 318 showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 319 } 320 321 #define DO_imm_r_to_mscalar(_opname, _imm, _src) \ 322 { \ 323 ULong _scbefore = 0x5555555555555555ULL; \ 324 ULong _scafter = _scbefore; \ 325 __asm__ __volatile__( \ 326 "movupd (%0), %%xmm2" "\n\t" \ 327 _opname " $" #_imm ", %%xmm2, (%1)" "\n\t" \ 328 : /*out*/ \ 329 : /*in*/ "r"(&(_src)), "r"(&(_scafter)) \ 330 : "cc", "memory", "xmm2" \ 331 ); \ 332 showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \ 333 } 334 335 #define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix) \ 336 DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix ) \ 337 DO_imm_r_to_mscalar( _opname, _imm, _src ) 338 339 340 341 342 343 344 345 346 #define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix) \ 347 { \ 348 V128 dstv; \ 349 V128 res; \ 350 ULong src64 = (ULong)(_src); \ 351 memcpy(dstv, fives, sizeof(dstv)); \ 352 memcpy(res, zeroes, sizeof(res)); \ 353 /* This assumes that gcc won't 
make any of %0, %1, %2 */ \ 354 /* be r11. That should be ensured (cough, cough) */ \ 355 /* by declaring r11 to be clobbered. */ \ 356 __asm__ __volatile__( \ 357 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 358 "movq (%1), %%r11" "\n\t" /*src64*/ \ 359 _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2" "\n\t" \ 360 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 361 : /*out*/ \ 362 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 363 : "cc", "memory", "xmm2", "r11" \ 364 ); \ 365 showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \ 366 } 367 #define DO_imm_mscalar_to_r(_opname, _imm, _src) \ 368 { \ 369 V128 dstv; \ 370 V128 res; \ 371 ULong src64 = (ULong)(_src); \ 372 memcpy(dstv, fives, sizeof(dstv)); \ 373 memcpy(res, zeroes, sizeof(res)); \ 374 __asm__ __volatile__( \ 375 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \ 376 _opname " $" #_imm ", (%1), %%xmm2" "\n\t" \ 377 "movupd %%xmm2, (%2)" "\n" /*res*/ \ 378 : /*out*/ \ 379 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \ 380 : "cc", "memory", "xmm2" \ 381 ); \ 382 showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \ 383 } 384 385 #define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix) \ 386 DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix ) \ 387 DO_imm_mscalar_to_r( _opname, _imm, _src ) 388 389 390 391 392 393 void test_BLENDPD ( void ) 394 { 395 V128 src, dst; 396 Int i; 397 for (i = 0; i < 10; i++) { 398 randV128(&src); 399 randV128(&dst); 400 DO_imm_mandr_r("blendpd", 0, src, dst); 401 DO_imm_mandr_r("blendpd", 1, src, dst); 402 DO_imm_mandr_r("blendpd", 2, src, dst); 403 DO_imm_mandr_r("blendpd", 3, src, dst); 404 } 405 } 406 407 void test_BLENDPS ( void ) 408 { 409 V128 src, dst; 410 Int i; 411 for (i = 0; i < 10; i++) { 412 randV128(&src); 413 randV128(&dst); 414 DO_imm_mandr_r("blendps", 0, src, dst); 415 DO_imm_mandr_r("blendps", 1, src, dst); 416 DO_imm_mandr_r("blendps", 2, src, dst); 417 DO_imm_mandr_r("blendps", 3, src, dst); 418 DO_imm_mandr_r("blendps", 4, src, dst); 419 DO_imm_mandr_r("blendps", 5, 
src, dst); 420 DO_imm_mandr_r("blendps", 6, src, dst); 421 DO_imm_mandr_r("blendps", 7, src, dst); 422 DO_imm_mandr_r("blendps", 8, src, dst); 423 DO_imm_mandr_r("blendps", 9, src, dst); 424 DO_imm_mandr_r("blendps", 10, src, dst); 425 DO_imm_mandr_r("blendps", 11, src, dst); 426 DO_imm_mandr_r("blendps", 12, src, dst); 427 DO_imm_mandr_r("blendps", 13, src, dst); 428 DO_imm_mandr_r("blendps", 14, src, dst); 429 DO_imm_mandr_r("blendps", 15, src, dst); 430 } 431 } 432 433 void test_DPPD ( void ) 434 { 435 V128 src, dst; 436 { 437 *(double*)(&src[0]) = 1.2345; 438 *(double*)(&src[8]) = -6.78910; 439 *(double*)(&dst[0]) = -11.121314; 440 *(double*)(&dst[8]) = 15.161718; 441 DO_imm_mandr_r("dppd", 0, src, dst); 442 DO_imm_mandr_r("dppd", 1, src, dst); 443 DO_imm_mandr_r("dppd", 2, src, dst); 444 DO_imm_mandr_r("dppd", 3, src, dst); 445 DO_imm_mandr_r("dppd", 4, src, dst); 446 DO_imm_mandr_r("dppd", 5, src, dst); 447 DO_imm_mandr_r("dppd", 6, src, dst); 448 DO_imm_mandr_r("dppd", 7, src, dst); 449 DO_imm_mandr_r("dppd", 8, src, dst); 450 DO_imm_mandr_r("dppd", 9, src, dst); 451 DO_imm_mandr_r("dppd", 10, src, dst); 452 DO_imm_mandr_r("dppd", 11, src, dst); 453 DO_imm_mandr_r("dppd", 12, src, dst); 454 DO_imm_mandr_r("dppd", 13, src, dst); 455 DO_imm_mandr_r("dppd", 14, src, dst); 456 DO_imm_mandr_r("dppd", 15, src, dst); 457 DO_imm_mandr_r("dppd", 16, src, dst); 458 DO_imm_mandr_r("dppd", 17, src, dst); 459 DO_imm_mandr_r("dppd", 18, src, dst); 460 DO_imm_mandr_r("dppd", 19, src, dst); 461 DO_imm_mandr_r("dppd", 20, src, dst); 462 DO_imm_mandr_r("dppd", 21, src, dst); 463 DO_imm_mandr_r("dppd", 22, src, dst); 464 DO_imm_mandr_r("dppd", 23, src, dst); 465 DO_imm_mandr_r("dppd", 24, src, dst); 466 DO_imm_mandr_r("dppd", 25, src, dst); 467 DO_imm_mandr_r("dppd", 26, src, dst); 468 DO_imm_mandr_r("dppd", 27, src, dst); 469 DO_imm_mandr_r("dppd", 28, src, dst); 470 DO_imm_mandr_r("dppd", 29, src, dst); 471 DO_imm_mandr_r("dppd", 30, src, dst); 472 DO_imm_mandr_r("dppd", 31, 
src, dst); 473 DO_imm_mandr_r("dppd", 32, src, dst); 474 DO_imm_mandr_r("dppd", 33, src, dst); 475 DO_imm_mandr_r("dppd", 34, src, dst); 476 DO_imm_mandr_r("dppd", 35, src, dst); 477 DO_imm_mandr_r("dppd", 36, src, dst); 478 DO_imm_mandr_r("dppd", 37, src, dst); 479 DO_imm_mandr_r("dppd", 38, src, dst); 480 DO_imm_mandr_r("dppd", 39, src, dst); 481 DO_imm_mandr_r("dppd", 40, src, dst); 482 DO_imm_mandr_r("dppd", 41, src, dst); 483 DO_imm_mandr_r("dppd", 42, src, dst); 484 DO_imm_mandr_r("dppd", 43, src, dst); 485 DO_imm_mandr_r("dppd", 44, src, dst); 486 DO_imm_mandr_r("dppd", 45, src, dst); 487 DO_imm_mandr_r("dppd", 46, src, dst); 488 DO_imm_mandr_r("dppd", 47, src, dst); 489 DO_imm_mandr_r("dppd", 48, src, dst); 490 DO_imm_mandr_r("dppd", 49, src, dst); 491 DO_imm_mandr_r("dppd", 50, src, dst); 492 DO_imm_mandr_r("dppd", 51, src, dst); 493 DO_imm_mandr_r("dppd", 52, src, dst); 494 DO_imm_mandr_r("dppd", 53, src, dst); 495 DO_imm_mandr_r("dppd", 54, src, dst); 496 DO_imm_mandr_r("dppd", 55, src, dst); 497 DO_imm_mandr_r("dppd", 56, src, dst); 498 DO_imm_mandr_r("dppd", 57, src, dst); 499 DO_imm_mandr_r("dppd", 58, src, dst); 500 DO_imm_mandr_r("dppd", 59, src, dst); 501 DO_imm_mandr_r("dppd", 60, src, dst); 502 DO_imm_mandr_r("dppd", 61, src, dst); 503 DO_imm_mandr_r("dppd", 62, src, dst); 504 DO_imm_mandr_r("dppd", 63, src, dst); 505 DO_imm_mandr_r("dppd", 64, src, dst); 506 DO_imm_mandr_r("dppd", 65, src, dst); 507 DO_imm_mandr_r("dppd", 66, src, dst); 508 DO_imm_mandr_r("dppd", 67, src, dst); 509 DO_imm_mandr_r("dppd", 68, src, dst); 510 DO_imm_mandr_r("dppd", 69, src, dst); 511 DO_imm_mandr_r("dppd", 70, src, dst); 512 DO_imm_mandr_r("dppd", 71, src, dst); 513 DO_imm_mandr_r("dppd", 72, src, dst); 514 DO_imm_mandr_r("dppd", 73, src, dst); 515 DO_imm_mandr_r("dppd", 74, src, dst); 516 DO_imm_mandr_r("dppd", 75, src, dst); 517 DO_imm_mandr_r("dppd", 76, src, dst); 518 DO_imm_mandr_r("dppd", 77, src, dst); 519 DO_imm_mandr_r("dppd", 78, src, dst); 520 
DO_imm_mandr_r("dppd", 79, src, dst); 521 DO_imm_mandr_r("dppd", 80, src, dst); 522 DO_imm_mandr_r("dppd", 81, src, dst); 523 DO_imm_mandr_r("dppd", 82, src, dst); 524 DO_imm_mandr_r("dppd", 83, src, dst); 525 DO_imm_mandr_r("dppd", 84, src, dst); 526 DO_imm_mandr_r("dppd", 85, src, dst); 527 DO_imm_mandr_r("dppd", 86, src, dst); 528 DO_imm_mandr_r("dppd", 87, src, dst); 529 DO_imm_mandr_r("dppd", 88, src, dst); 530 DO_imm_mandr_r("dppd", 89, src, dst); 531 DO_imm_mandr_r("dppd", 90, src, dst); 532 DO_imm_mandr_r("dppd", 91, src, dst); 533 DO_imm_mandr_r("dppd", 92, src, dst); 534 DO_imm_mandr_r("dppd", 93, src, dst); 535 DO_imm_mandr_r("dppd", 94, src, dst); 536 DO_imm_mandr_r("dppd", 95, src, dst); 537 DO_imm_mandr_r("dppd", 96, src, dst); 538 DO_imm_mandr_r("dppd", 97, src, dst); 539 DO_imm_mandr_r("dppd", 98, src, dst); 540 DO_imm_mandr_r("dppd", 99, src, dst); 541 DO_imm_mandr_r("dppd", 100, src, dst); 542 DO_imm_mandr_r("dppd", 101, src, dst); 543 DO_imm_mandr_r("dppd", 102, src, dst); 544 DO_imm_mandr_r("dppd", 103, src, dst); 545 DO_imm_mandr_r("dppd", 104, src, dst); 546 DO_imm_mandr_r("dppd", 105, src, dst); 547 DO_imm_mandr_r("dppd", 106, src, dst); 548 DO_imm_mandr_r("dppd", 107, src, dst); 549 DO_imm_mandr_r("dppd", 108, src, dst); 550 DO_imm_mandr_r("dppd", 109, src, dst); 551 DO_imm_mandr_r("dppd", 110, src, dst); 552 DO_imm_mandr_r("dppd", 111, src, dst); 553 DO_imm_mandr_r("dppd", 112, src, dst); 554 DO_imm_mandr_r("dppd", 113, src, dst); 555 DO_imm_mandr_r("dppd", 114, src, dst); 556 DO_imm_mandr_r("dppd", 115, src, dst); 557 DO_imm_mandr_r("dppd", 116, src, dst); 558 DO_imm_mandr_r("dppd", 117, src, dst); 559 DO_imm_mandr_r("dppd", 118, src, dst); 560 DO_imm_mandr_r("dppd", 119, src, dst); 561 DO_imm_mandr_r("dppd", 120, src, dst); 562 DO_imm_mandr_r("dppd", 121, src, dst); 563 DO_imm_mandr_r("dppd", 122, src, dst); 564 DO_imm_mandr_r("dppd", 123, src, dst); 565 DO_imm_mandr_r("dppd", 124, src, dst); 566 DO_imm_mandr_r("dppd", 125, src, dst); 567 
DO_imm_mandr_r("dppd", 126, src, dst); 568 DO_imm_mandr_r("dppd", 127, src, dst); 569 DO_imm_mandr_r("dppd", 128, src, dst); 570 DO_imm_mandr_r("dppd", 129, src, dst); 571 DO_imm_mandr_r("dppd", 130, src, dst); 572 DO_imm_mandr_r("dppd", 131, src, dst); 573 DO_imm_mandr_r("dppd", 132, src, dst); 574 DO_imm_mandr_r("dppd", 133, src, dst); 575 DO_imm_mandr_r("dppd", 134, src, dst); 576 DO_imm_mandr_r("dppd", 135, src, dst); 577 DO_imm_mandr_r("dppd", 136, src, dst); 578 DO_imm_mandr_r("dppd", 137, src, dst); 579 DO_imm_mandr_r("dppd", 138, src, dst); 580 DO_imm_mandr_r("dppd", 139, src, dst); 581 DO_imm_mandr_r("dppd", 140, src, dst); 582 DO_imm_mandr_r("dppd", 141, src, dst); 583 DO_imm_mandr_r("dppd", 142, src, dst); 584 DO_imm_mandr_r("dppd", 143, src, dst); 585 DO_imm_mandr_r("dppd", 144, src, dst); 586 DO_imm_mandr_r("dppd", 145, src, dst); 587 DO_imm_mandr_r("dppd", 146, src, dst); 588 DO_imm_mandr_r("dppd", 147, src, dst); 589 DO_imm_mandr_r("dppd", 148, src, dst); 590 DO_imm_mandr_r("dppd", 149, src, dst); 591 DO_imm_mandr_r("dppd", 150, src, dst); 592 DO_imm_mandr_r("dppd", 151, src, dst); 593 DO_imm_mandr_r("dppd", 152, src, dst); 594 DO_imm_mandr_r("dppd", 153, src, dst); 595 DO_imm_mandr_r("dppd", 154, src, dst); 596 DO_imm_mandr_r("dppd", 155, src, dst); 597 DO_imm_mandr_r("dppd", 156, src, dst); 598 DO_imm_mandr_r("dppd", 157, src, dst); 599 DO_imm_mandr_r("dppd", 158, src, dst); 600 DO_imm_mandr_r("dppd", 159, src, dst); 601 DO_imm_mandr_r("dppd", 160, src, dst); 602 DO_imm_mandr_r("dppd", 161, src, dst); 603 DO_imm_mandr_r("dppd", 162, src, dst); 604 DO_imm_mandr_r("dppd", 163, src, dst); 605 DO_imm_mandr_r("dppd", 164, src, dst); 606 DO_imm_mandr_r("dppd", 165, src, dst); 607 DO_imm_mandr_r("dppd", 166, src, dst); 608 DO_imm_mandr_r("dppd", 167, src, dst); 609 DO_imm_mandr_r("dppd", 168, src, dst); 610 DO_imm_mandr_r("dppd", 169, src, dst); 611 DO_imm_mandr_r("dppd", 170, src, dst); 612 DO_imm_mandr_r("dppd", 171, src, dst); 613 
DO_imm_mandr_r("dppd", 172, src, dst); 614 DO_imm_mandr_r("dppd", 173, src, dst); 615 DO_imm_mandr_r("dppd", 174, src, dst); 616 DO_imm_mandr_r("dppd", 175, src, dst); 617 DO_imm_mandr_r("dppd", 176, src, dst); 618 DO_imm_mandr_r("dppd", 177, src, dst); 619 DO_imm_mandr_r("dppd", 178, src, dst); 620 DO_imm_mandr_r("dppd", 179, src, dst); 621 DO_imm_mandr_r("dppd", 180, src, dst); 622 DO_imm_mandr_r("dppd", 181, src, dst); 623 DO_imm_mandr_r("dppd", 182, src, dst); 624 DO_imm_mandr_r("dppd", 183, src, dst); 625 DO_imm_mandr_r("dppd", 184, src, dst); 626 DO_imm_mandr_r("dppd", 185, src, dst); 627 DO_imm_mandr_r("dppd", 186, src, dst); 628 DO_imm_mandr_r("dppd", 187, src, dst); 629 DO_imm_mandr_r("dppd", 188, src, dst); 630 DO_imm_mandr_r("dppd", 189, src, dst); 631 DO_imm_mandr_r("dppd", 190, src, dst); 632 DO_imm_mandr_r("dppd", 191, src, dst); 633 DO_imm_mandr_r("dppd", 192, src, dst); 634 DO_imm_mandr_r("dppd", 193, src, dst); 635 DO_imm_mandr_r("dppd", 194, src, dst); 636 DO_imm_mandr_r("dppd", 195, src, dst); 637 DO_imm_mandr_r("dppd", 196, src, dst); 638 DO_imm_mandr_r("dppd", 197, src, dst); 639 DO_imm_mandr_r("dppd", 198, src, dst); 640 DO_imm_mandr_r("dppd", 199, src, dst); 641 DO_imm_mandr_r("dppd", 200, src, dst); 642 DO_imm_mandr_r("dppd", 201, src, dst); 643 DO_imm_mandr_r("dppd", 202, src, dst); 644 DO_imm_mandr_r("dppd", 203, src, dst); 645 DO_imm_mandr_r("dppd", 204, src, dst); 646 DO_imm_mandr_r("dppd", 205, src, dst); 647 DO_imm_mandr_r("dppd", 206, src, dst); 648 DO_imm_mandr_r("dppd", 207, src, dst); 649 DO_imm_mandr_r("dppd", 208, src, dst); 650 DO_imm_mandr_r("dppd", 209, src, dst); 651 DO_imm_mandr_r("dppd", 210, src, dst); 652 DO_imm_mandr_r("dppd", 211, src, dst); 653 DO_imm_mandr_r("dppd", 212, src, dst); 654 DO_imm_mandr_r("dppd", 213, src, dst); 655 DO_imm_mandr_r("dppd", 214, src, dst); 656 DO_imm_mandr_r("dppd", 215, src, dst); 657 DO_imm_mandr_r("dppd", 216, src, dst); 658 DO_imm_mandr_r("dppd", 217, src, dst); 659 
DO_imm_mandr_r("dppd", 218, src, dst); 660 DO_imm_mandr_r("dppd", 219, src, dst); 661 DO_imm_mandr_r("dppd", 220, src, dst); 662 DO_imm_mandr_r("dppd", 221, src, dst); 663 DO_imm_mandr_r("dppd", 222, src, dst); 664 DO_imm_mandr_r("dppd", 223, src, dst); 665 DO_imm_mandr_r("dppd", 224, src, dst); 666 DO_imm_mandr_r("dppd", 225, src, dst); 667 DO_imm_mandr_r("dppd", 226, src, dst); 668 DO_imm_mandr_r("dppd", 227, src, dst); 669 DO_imm_mandr_r("dppd", 228, src, dst); 670 DO_imm_mandr_r("dppd", 229, src, dst); 671 DO_imm_mandr_r("dppd", 230, src, dst); 672 DO_imm_mandr_r("dppd", 231, src, dst); 673 DO_imm_mandr_r("dppd", 232, src, dst); 674 DO_imm_mandr_r("dppd", 233, src, dst); 675 DO_imm_mandr_r("dppd", 234, src, dst); 676 DO_imm_mandr_r("dppd", 235, src, dst); 677 DO_imm_mandr_r("dppd", 236, src, dst); 678 DO_imm_mandr_r("dppd", 237, src, dst); 679 DO_imm_mandr_r("dppd", 238, src, dst); 680 DO_imm_mandr_r("dppd", 239, src, dst); 681 DO_imm_mandr_r("dppd", 240, src, dst); 682 DO_imm_mandr_r("dppd", 241, src, dst); 683 DO_imm_mandr_r("dppd", 242, src, dst); 684 DO_imm_mandr_r("dppd", 243, src, dst); 685 DO_imm_mandr_r("dppd", 244, src, dst); 686 DO_imm_mandr_r("dppd", 245, src, dst); 687 DO_imm_mandr_r("dppd", 246, src, dst); 688 DO_imm_mandr_r("dppd", 247, src, dst); 689 DO_imm_mandr_r("dppd", 248, src, dst); 690 DO_imm_mandr_r("dppd", 249, src, dst); 691 DO_imm_mandr_r("dppd", 250, src, dst); 692 DO_imm_mandr_r("dppd", 251, src, dst); 693 DO_imm_mandr_r("dppd", 252, src, dst); 694 DO_imm_mandr_r("dppd", 253, src, dst); 695 DO_imm_mandr_r("dppd", 254, src, dst); 696 DO_imm_mandr_r("dppd", 255, src, dst); 697 } 698 } 699 700 void test_DPPS ( void ) 701 { 702 V128 src, dst; 703 { 704 *(float*)(&src[0]) = 1.2; 705 *(float*)(&src[4]) = -3.4; 706 *(float*)(&src[8]) = -6.7; 707 *(float*)(&src[12]) = 8.9; 708 *(float*)(&dst[0]) = -10.11; 709 *(float*)(&dst[4]) = 12.13; 710 *(float*)(&dst[8]) = 14.15; 711 *(float*)(&dst[12]) = -16.17; 712 DO_imm_mandr_r("dpps", 0, src, dst); 
713 DO_imm_mandr_r("dpps", 1, src, dst); 714 DO_imm_mandr_r("dpps", 2, src, dst); 715 DO_imm_mandr_r("dpps", 3, src, dst); 716 DO_imm_mandr_r("dpps", 4, src, dst); 717 DO_imm_mandr_r("dpps", 5, src, dst); 718 DO_imm_mandr_r("dpps", 6, src, dst); 719 DO_imm_mandr_r("dpps", 7, src, dst); 720 DO_imm_mandr_r("dpps", 8, src, dst); 721 DO_imm_mandr_r("dpps", 9, src, dst); 722 DO_imm_mandr_r("dpps", 10, src, dst); 723 DO_imm_mandr_r("dpps", 11, src, dst); 724 DO_imm_mandr_r("dpps", 12, src, dst); 725 DO_imm_mandr_r("dpps", 13, src, dst); 726 DO_imm_mandr_r("dpps", 14, src, dst); 727 DO_imm_mandr_r("dpps", 15, src, dst); 728 DO_imm_mandr_r("dpps", 16, src, dst); 729 DO_imm_mandr_r("dpps", 17, src, dst); 730 DO_imm_mandr_r("dpps", 18, src, dst); 731 DO_imm_mandr_r("dpps", 19, src, dst); 732 DO_imm_mandr_r("dpps", 20, src, dst); 733 DO_imm_mandr_r("dpps", 21, src, dst); 734 DO_imm_mandr_r("dpps", 22, src, dst); 735 DO_imm_mandr_r("dpps", 23, src, dst); 736 DO_imm_mandr_r("dpps", 24, src, dst); 737 DO_imm_mandr_r("dpps", 25, src, dst); 738 DO_imm_mandr_r("dpps", 26, src, dst); 739 DO_imm_mandr_r("dpps", 27, src, dst); 740 DO_imm_mandr_r("dpps", 28, src, dst); 741 DO_imm_mandr_r("dpps", 29, src, dst); 742 DO_imm_mandr_r("dpps", 30, src, dst); 743 DO_imm_mandr_r("dpps", 31, src, dst); 744 DO_imm_mandr_r("dpps", 32, src, dst); 745 DO_imm_mandr_r("dpps", 33, src, dst); 746 DO_imm_mandr_r("dpps", 34, src, dst); 747 DO_imm_mandr_r("dpps", 35, src, dst); 748 DO_imm_mandr_r("dpps", 36, src, dst); 749 DO_imm_mandr_r("dpps", 37, src, dst); 750 DO_imm_mandr_r("dpps", 38, src, dst); 751 DO_imm_mandr_r("dpps", 39, src, dst); 752 DO_imm_mandr_r("dpps", 40, src, dst); 753 DO_imm_mandr_r("dpps", 41, src, dst); 754 DO_imm_mandr_r("dpps", 42, src, dst); 755 DO_imm_mandr_r("dpps", 43, src, dst); 756 DO_imm_mandr_r("dpps", 44, src, dst); 757 DO_imm_mandr_r("dpps", 45, src, dst); 758 DO_imm_mandr_r("dpps", 46, src, dst); 759 DO_imm_mandr_r("dpps", 47, src, dst); 760 DO_imm_mandr_r("dpps", 48, 
src, dst); 761 DO_imm_mandr_r("dpps", 49, src, dst); 762 DO_imm_mandr_r("dpps", 50, src, dst); 763 DO_imm_mandr_r("dpps", 51, src, dst); 764 DO_imm_mandr_r("dpps", 52, src, dst); 765 DO_imm_mandr_r("dpps", 53, src, dst); 766 DO_imm_mandr_r("dpps", 54, src, dst); 767 DO_imm_mandr_r("dpps", 55, src, dst); 768 DO_imm_mandr_r("dpps", 56, src, dst); 769 DO_imm_mandr_r("dpps", 57, src, dst); 770 DO_imm_mandr_r("dpps", 58, src, dst); 771 DO_imm_mandr_r("dpps", 59, src, dst); 772 DO_imm_mandr_r("dpps", 60, src, dst); 773 DO_imm_mandr_r("dpps", 61, src, dst); 774 DO_imm_mandr_r("dpps", 62, src, dst); 775 DO_imm_mandr_r("dpps", 63, src, dst); 776 DO_imm_mandr_r("dpps", 64, src, dst); 777 DO_imm_mandr_r("dpps", 65, src, dst); 778 DO_imm_mandr_r("dpps", 66, src, dst); 779 DO_imm_mandr_r("dpps", 67, src, dst); 780 DO_imm_mandr_r("dpps", 68, src, dst); 781 DO_imm_mandr_r("dpps", 69, src, dst); 782 DO_imm_mandr_r("dpps", 70, src, dst); 783 DO_imm_mandr_r("dpps", 71, src, dst); 784 DO_imm_mandr_r("dpps", 72, src, dst); 785 DO_imm_mandr_r("dpps", 73, src, dst); 786 DO_imm_mandr_r("dpps", 74, src, dst); 787 DO_imm_mandr_r("dpps", 75, src, dst); 788 DO_imm_mandr_r("dpps", 76, src, dst); 789 DO_imm_mandr_r("dpps", 77, src, dst); 790 DO_imm_mandr_r("dpps", 78, src, dst); 791 DO_imm_mandr_r("dpps", 79, src, dst); 792 DO_imm_mandr_r("dpps", 80, src, dst); 793 DO_imm_mandr_r("dpps", 81, src, dst); 794 DO_imm_mandr_r("dpps", 82, src, dst); 795 DO_imm_mandr_r("dpps", 83, src, dst); 796 DO_imm_mandr_r("dpps", 84, src, dst); 797 DO_imm_mandr_r("dpps", 85, src, dst); 798 DO_imm_mandr_r("dpps", 86, src, dst); 799 DO_imm_mandr_r("dpps", 87, src, dst); 800 DO_imm_mandr_r("dpps", 88, src, dst); 801 DO_imm_mandr_r("dpps", 89, src, dst); 802 DO_imm_mandr_r("dpps", 90, src, dst); 803 DO_imm_mandr_r("dpps", 91, src, dst); 804 DO_imm_mandr_r("dpps", 92, src, dst); 805 DO_imm_mandr_r("dpps", 93, src, dst); 806 DO_imm_mandr_r("dpps", 94, src, dst); 807 DO_imm_mandr_r("dpps", 95, src, dst); 808 
DO_imm_mandr_r("dpps", 96, src, dst); 809 DO_imm_mandr_r("dpps", 97, src, dst); 810 DO_imm_mandr_r("dpps", 98, src, dst); 811 DO_imm_mandr_r("dpps", 99, src, dst); 812 DO_imm_mandr_r("dpps", 100, src, dst); 813 DO_imm_mandr_r("dpps", 101, src, dst); 814 DO_imm_mandr_r("dpps", 102, src, dst); 815 DO_imm_mandr_r("dpps", 103, src, dst); 816 DO_imm_mandr_r("dpps", 104, src, dst); 817 DO_imm_mandr_r("dpps", 105, src, dst); 818 DO_imm_mandr_r("dpps", 106, src, dst); 819 DO_imm_mandr_r("dpps", 107, src, dst); 820 DO_imm_mandr_r("dpps", 108, src, dst); 821 DO_imm_mandr_r("dpps", 109, src, dst); 822 DO_imm_mandr_r("dpps", 110, src, dst); 823 DO_imm_mandr_r("dpps", 111, src, dst); 824 DO_imm_mandr_r("dpps", 112, src, dst); 825 DO_imm_mandr_r("dpps", 113, src, dst); 826 DO_imm_mandr_r("dpps", 114, src, dst); 827 DO_imm_mandr_r("dpps", 115, src, dst); 828 DO_imm_mandr_r("dpps", 116, src, dst); 829 DO_imm_mandr_r("dpps", 117, src, dst); 830 DO_imm_mandr_r("dpps", 118, src, dst); 831 DO_imm_mandr_r("dpps", 119, src, dst); 832 DO_imm_mandr_r("dpps", 120, src, dst); 833 DO_imm_mandr_r("dpps", 121, src, dst); 834 DO_imm_mandr_r("dpps", 122, src, dst); 835 DO_imm_mandr_r("dpps", 123, src, dst); 836 DO_imm_mandr_r("dpps", 124, src, dst); 837 DO_imm_mandr_r("dpps", 125, src, dst); 838 DO_imm_mandr_r("dpps", 126, src, dst); 839 DO_imm_mandr_r("dpps", 127, src, dst); 840 DO_imm_mandr_r("dpps", 128, src, dst); 841 DO_imm_mandr_r("dpps", 129, src, dst); 842 DO_imm_mandr_r("dpps", 130, src, dst); 843 DO_imm_mandr_r("dpps", 131, src, dst); 844 DO_imm_mandr_r("dpps", 132, src, dst); 845 DO_imm_mandr_r("dpps", 133, src, dst); 846 DO_imm_mandr_r("dpps", 134, src, dst); 847 DO_imm_mandr_r("dpps", 135, src, dst); 848 DO_imm_mandr_r("dpps", 136, src, dst); 849 DO_imm_mandr_r("dpps", 137, src, dst); 850 DO_imm_mandr_r("dpps", 138, src, dst); 851 DO_imm_mandr_r("dpps", 139, src, dst); 852 DO_imm_mandr_r("dpps", 140, src, dst); 853 DO_imm_mandr_r("dpps", 141, src, dst); 854 DO_imm_mandr_r("dpps", 
142, src, dst); 855 DO_imm_mandr_r("dpps", 143, src, dst); 856 DO_imm_mandr_r("dpps", 144, src, dst); 857 DO_imm_mandr_r("dpps", 145, src, dst); 858 DO_imm_mandr_r("dpps", 146, src, dst); 859 DO_imm_mandr_r("dpps", 147, src, dst); 860 DO_imm_mandr_r("dpps", 148, src, dst); 861 DO_imm_mandr_r("dpps", 149, src, dst); 862 DO_imm_mandr_r("dpps", 150, src, dst); 863 DO_imm_mandr_r("dpps", 151, src, dst); 864 DO_imm_mandr_r("dpps", 152, src, dst); 865 DO_imm_mandr_r("dpps", 153, src, dst); 866 DO_imm_mandr_r("dpps", 154, src, dst); 867 DO_imm_mandr_r("dpps", 155, src, dst); 868 DO_imm_mandr_r("dpps", 156, src, dst); 869 DO_imm_mandr_r("dpps", 157, src, dst); 870 DO_imm_mandr_r("dpps", 158, src, dst); 871 DO_imm_mandr_r("dpps", 159, src, dst); 872 DO_imm_mandr_r("dpps", 160, src, dst); 873 DO_imm_mandr_r("dpps", 161, src, dst); 874 DO_imm_mandr_r("dpps", 162, src, dst); 875 DO_imm_mandr_r("dpps", 163, src, dst); 876 DO_imm_mandr_r("dpps", 164, src, dst); 877 DO_imm_mandr_r("dpps", 165, src, dst); 878 DO_imm_mandr_r("dpps", 166, src, dst); 879 DO_imm_mandr_r("dpps", 167, src, dst); 880 DO_imm_mandr_r("dpps", 168, src, dst); 881 DO_imm_mandr_r("dpps", 169, src, dst); 882 DO_imm_mandr_r("dpps", 170, src, dst); 883 DO_imm_mandr_r("dpps", 171, src, dst); 884 DO_imm_mandr_r("dpps", 172, src, dst); 885 DO_imm_mandr_r("dpps", 173, src, dst); 886 DO_imm_mandr_r("dpps", 174, src, dst); 887 DO_imm_mandr_r("dpps", 175, src, dst); 888 DO_imm_mandr_r("dpps", 176, src, dst); 889 DO_imm_mandr_r("dpps", 177, src, dst); 890 DO_imm_mandr_r("dpps", 178, src, dst); 891 DO_imm_mandr_r("dpps", 179, src, dst); 892 DO_imm_mandr_r("dpps", 180, src, dst); 893 DO_imm_mandr_r("dpps", 181, src, dst); 894 DO_imm_mandr_r("dpps", 182, src, dst); 895 DO_imm_mandr_r("dpps", 183, src, dst); 896 DO_imm_mandr_r("dpps", 184, src, dst); 897 DO_imm_mandr_r("dpps", 185, src, dst); 898 DO_imm_mandr_r("dpps", 186, src, dst); 899 DO_imm_mandr_r("dpps", 187, src, dst); 900 DO_imm_mandr_r("dpps", 188, src, dst); 901 
DO_imm_mandr_r("dpps", 189, src, dst); 902 DO_imm_mandr_r("dpps", 190, src, dst); 903 DO_imm_mandr_r("dpps", 191, src, dst); 904 DO_imm_mandr_r("dpps", 192, src, dst); 905 DO_imm_mandr_r("dpps", 193, src, dst); 906 DO_imm_mandr_r("dpps", 194, src, dst); 907 DO_imm_mandr_r("dpps", 195, src, dst); 908 DO_imm_mandr_r("dpps", 196, src, dst); 909 DO_imm_mandr_r("dpps", 197, src, dst); 910 DO_imm_mandr_r("dpps", 198, src, dst); 911 DO_imm_mandr_r("dpps", 199, src, dst); 912 DO_imm_mandr_r("dpps", 200, src, dst); 913 DO_imm_mandr_r("dpps", 201, src, dst); 914 DO_imm_mandr_r("dpps", 202, src, dst); 915 DO_imm_mandr_r("dpps", 203, src, dst); 916 DO_imm_mandr_r("dpps", 204, src, dst); 917 DO_imm_mandr_r("dpps", 205, src, dst); 918 DO_imm_mandr_r("dpps", 206, src, dst); 919 DO_imm_mandr_r("dpps", 207, src, dst); 920 DO_imm_mandr_r("dpps", 208, src, dst); 921 DO_imm_mandr_r("dpps", 209, src, dst); 922 DO_imm_mandr_r("dpps", 210, src, dst); 923 DO_imm_mandr_r("dpps", 211, src, dst); 924 DO_imm_mandr_r("dpps", 212, src, dst); 925 DO_imm_mandr_r("dpps", 213, src, dst); 926 DO_imm_mandr_r("dpps", 214, src, dst); 927 DO_imm_mandr_r("dpps", 215, src, dst); 928 DO_imm_mandr_r("dpps", 216, src, dst); 929 DO_imm_mandr_r("dpps", 217, src, dst); 930 DO_imm_mandr_r("dpps", 218, src, dst); 931 DO_imm_mandr_r("dpps", 219, src, dst); 932 DO_imm_mandr_r("dpps", 220, src, dst); 933 DO_imm_mandr_r("dpps", 221, src, dst); 934 DO_imm_mandr_r("dpps", 222, src, dst); 935 DO_imm_mandr_r("dpps", 223, src, dst); 936 DO_imm_mandr_r("dpps", 224, src, dst); 937 DO_imm_mandr_r("dpps", 225, src, dst); 938 DO_imm_mandr_r("dpps", 226, src, dst); 939 DO_imm_mandr_r("dpps", 227, src, dst); 940 DO_imm_mandr_r("dpps", 228, src, dst); 941 DO_imm_mandr_r("dpps", 229, src, dst); 942 DO_imm_mandr_r("dpps", 230, src, dst); 943 DO_imm_mandr_r("dpps", 231, src, dst); 944 DO_imm_mandr_r("dpps", 232, src, dst); 945 DO_imm_mandr_r("dpps", 233, src, dst); 946 DO_imm_mandr_r("dpps", 234, src, dst); 947 
/* dpps: immediates 235..255 close out the sweep. */
      DO_imm_mandr_r("dpps", 235, src, dst);
      DO_imm_mandr_r("dpps", 236, src, dst);
      DO_imm_mandr_r("dpps", 237, src, dst);
      DO_imm_mandr_r("dpps", 238, src, dst);
      DO_imm_mandr_r("dpps", 239, src, dst);
      DO_imm_mandr_r("dpps", 240, src, dst);
      DO_imm_mandr_r("dpps", 241, src, dst);
      DO_imm_mandr_r("dpps", 242, src, dst);
      DO_imm_mandr_r("dpps", 243, src, dst);
      DO_imm_mandr_r("dpps", 244, src, dst);
      DO_imm_mandr_r("dpps", 245, src, dst);
      DO_imm_mandr_r("dpps", 246, src, dst);
      DO_imm_mandr_r("dpps", 247, src, dst);
      DO_imm_mandr_r("dpps", 248, src, dst);
      DO_imm_mandr_r("dpps", 249, src, dst);
      DO_imm_mandr_r("dpps", 250, src, dst);
      DO_imm_mandr_r("dpps", 251, src, dst);
      DO_imm_mandr_r("dpps", 252, src, dst);
      DO_imm_mandr_r("dpps", 253, src, dst);
      DO_imm_mandr_r("dpps", 254, src, dst);
      DO_imm_mandr_r("dpps", 255, src, dst);
   }
}

/* Run "insertps" through DO_imm_mandr_r with every immediate from 0 to
   255, always on the same fixed pair of operands.  Each operand holds
   four floats, placed at byte offsets 0, 4, 8 and 12 of the V128.

   The floats are copied in with memcpy instead of being stored through
   a casted float*: V128 is an unsigned char array, so the cast-and-store
   form breaks the strict-aliasing rules and assumes an alignment the
   array does not guarantee.  The bytes that end up in src and dst are
   the same as before.

   NOTE(review): each immediate is written out as a literal, presumably
   because DO_imm_mandr_r needs a compile-time constant -- confirm
   against the macro definition. */
void test_INSERTPS ( void )
{
   static const float srcLanes[4] = {   1.2,  -3.4,  -6.7,    8.9 };
   static const float dstLanes[4] = { -10.11, 12.13, 14.15, -16.17 };
   V128 src, dst;
   memcpy(src, srcLanes, sizeof(src));
   memcpy(dst, dstLanes, sizeof(dst));
   {
      DO_imm_mandr_r("insertps", 0, src, dst);
      DO_imm_mandr_r("insertps", 1, src, dst);
      DO_imm_mandr_r("insertps", 2, src, dst);
      DO_imm_mandr_r("insertps", 3, src, dst);
      DO_imm_mandr_r("insertps", 4, src, dst);
      DO_imm_mandr_r("insertps", 5, src, dst);
      DO_imm_mandr_r("insertps", 6, src, dst);
      DO_imm_mandr_r("insertps", 7, src, dst);
      DO_imm_mandr_r("insertps", 8, src, dst);
      DO_imm_mandr_r("insertps", 9, src, dst);
      DO_imm_mandr_r("insertps", 10, src, dst);
      DO_imm_mandr_r("insertps", 11, src, dst);
      DO_imm_mandr_r("insertps", 12, src, dst);
      DO_imm_mandr_r("insertps", 13, src, dst);
      DO_imm_mandr_r("insertps", 14, src, dst);
      DO_imm_mandr_r("insertps", 15, src, dst);
      DO_imm_mandr_r("insertps", 16, src, dst);
      DO_imm_mandr_r("insertps", 17, src, dst);
      DO_imm_mandr_r("insertps", 18, src, dst);
      DO_imm_mandr_r("insertps", 19, src, dst);
      DO_imm_mandr_r("insertps", 20, src, dst);
      DO_imm_mandr_r("insertps", 21, src, dst);
      DO_imm_mandr_r("insertps", 22, src, dst);
      DO_imm_mandr_r("insertps", 23, src, dst);
      DO_imm_mandr_r("insertps", 24, src, dst);
      DO_imm_mandr_r("insertps", 25, src, dst);
      DO_imm_mandr_r("insertps", 26, src, dst);
      DO_imm_mandr_r("insertps", 27, src, dst);
      DO_imm_mandr_r("insertps", 28, src, dst);
      DO_imm_mandr_r("insertps", 29, src, dst);
      DO_imm_mandr_r("insertps", 30, src, dst);
      DO_imm_mandr_r("insertps", 31, src, dst);
      DO_imm_mandr_r("insertps", 32, src, dst);
      DO_imm_mandr_r("insertps", 33, src, dst);
      DO_imm_mandr_r("insertps", 34, src, dst);
      DO_imm_mandr_r("insertps", 35, src, dst);
      DO_imm_mandr_r("insertps", 36, src, dst);
      DO_imm_mandr_r("insertps", 37, src, dst);
      DO_imm_mandr_r("insertps", 38, src, dst);
      DO_imm_mandr_r("insertps", 39, src, dst);
      DO_imm_mandr_r("insertps", 40, src, dst);
      DO_imm_mandr_r("insertps", 41, src, dst);
      DO_imm_mandr_r("insertps", 42, src, dst);
      DO_imm_mandr_r("insertps", 43, src, dst);
      DO_imm_mandr_r("insertps", 44, src, dst);
      DO_imm_mandr_r("insertps", 45, src, dst);
      DO_imm_mandr_r("insertps", 46, src, dst);
      DO_imm_mandr_r("insertps", 47, src, dst);
      DO_imm_mandr_r("insertps", 48, src, dst);
      DO_imm_mandr_r("insertps", 49, src, dst);
      DO_imm_mandr_r("insertps", 50, src, dst);
      DO_imm_mandr_r("insertps", 51, src, dst);
      DO_imm_mandr_r("insertps", 52, src, dst);
      DO_imm_mandr_r("insertps", 53, src, dst);
      DO_imm_mandr_r("insertps", 54, src, dst);
      DO_imm_mandr_r("insertps", 55, src, dst);
      DO_imm_mandr_r("insertps", 56, src, dst);
      DO_imm_mandr_r("insertps", 57, src, dst);
      DO_imm_mandr_r("insertps", 58, src, dst);
      DO_imm_mandr_r("insertps", 59, src, dst);
      DO_imm_mandr_r("insertps", 60, src, dst);
      DO_imm_mandr_r("insertps", 61, src, dst);
      DO_imm_mandr_r("insertps", 62, src, dst);
      DO_imm_mandr_r("insertps", 63, src, dst);
      DO_imm_mandr_r("insertps", 64, src, dst);
      DO_imm_mandr_r("insertps", 65, src, dst);
      DO_imm_mandr_r("insertps", 66, src, dst);
      DO_imm_mandr_r("insertps", 67, src, dst);
      DO_imm_mandr_r("insertps", 68, src, dst);
      DO_imm_mandr_r("insertps", 69, src, dst);
      DO_imm_mandr_r("insertps", 70, src, dst);
      DO_imm_mandr_r("insertps", 71, src, dst);
      DO_imm_mandr_r("insertps", 72, src, dst);
      DO_imm_mandr_r("insertps", 73, src, dst);
      DO_imm_mandr_r("insertps", 74, src, dst);
      DO_imm_mandr_r("insertps", 75, src, dst);
      DO_imm_mandr_r("insertps", 76, src, dst);
      DO_imm_mandr_r("insertps", 77, src, dst);
      DO_imm_mandr_r("insertps", 78, src, dst);
      DO_imm_mandr_r("insertps", 79, src, dst);
      DO_imm_mandr_r("insertps", 80, src, dst);
      DO_imm_mandr_r("insertps", 81, src, dst);
      DO_imm_mandr_r("insertps", 82, src, dst);
      DO_imm_mandr_r("insertps", 83, src, dst);
      DO_imm_mandr_r("insertps", 84, src, dst);
      DO_imm_mandr_r("insertps", 85, src, dst);
      DO_imm_mandr_r("insertps", 86, src, dst);
      DO_imm_mandr_r("insertps", 87, src, dst);
      DO_imm_mandr_r("insertps", 88, src, dst);
      DO_imm_mandr_r("insertps", 89, src, dst);
      DO_imm_mandr_r("insertps", 90, src, dst);
      DO_imm_mandr_r("insertps", 91, src, dst);
      DO_imm_mandr_r("insertps", 92, src, dst);
      DO_imm_mandr_r("insertps", 93, src, dst);
      DO_imm_mandr_r("insertps", 94, src, dst);
      DO_imm_mandr_r("insertps", 95, src, dst);
      DO_imm_mandr_r("insertps", 96, src, dst);
      DO_imm_mandr_r("insertps", 97, src, dst);
      DO_imm_mandr_r("insertps", 98, src, dst);
      DO_imm_mandr_r("insertps", 99, src, dst);
      DO_imm_mandr_r("insertps", 100, src, dst);
      DO_imm_mandr_r("insertps", 101, src, dst);
      DO_imm_mandr_r("insertps", 102, src, dst);
      DO_imm_mandr_r("insertps", 103, src, dst);
      DO_imm_mandr_r("insertps", 104, src, dst);
      DO_imm_mandr_r("insertps", 105, src, dst);
      DO_imm_mandr_r("insertps", 106, src, dst);
      DO_imm_mandr_r("insertps", 107, src, dst);
      DO_imm_mandr_r("insertps", 108, src, dst);
      DO_imm_mandr_r("insertps", 109, src, dst);
      DO_imm_mandr_r("insertps", 110, src, dst);
      DO_imm_mandr_r("insertps", 111, src, dst);
      DO_imm_mandr_r("insertps", 112, src, dst);
      DO_imm_mandr_r("insertps", 113, src, dst);
      DO_imm_mandr_r("insertps", 114, src, dst);
      DO_imm_mandr_r("insertps", 115, src, dst);
      DO_imm_mandr_r("insertps", 116, src, dst);
      DO_imm_mandr_r("insertps", 117, src, dst);
      DO_imm_mandr_r("insertps", 118, src, dst);
      DO_imm_mandr_r("insertps", 119, src, dst);
      DO_imm_mandr_r("insertps", 120, src, dst);
      DO_imm_mandr_r("insertps", 121, src, dst);
      DO_imm_mandr_r("insertps", 122, src, dst);
      DO_imm_mandr_r("insertps", 123, src, dst);
      DO_imm_mandr_r("insertps", 124, src, dst);
      DO_imm_mandr_r("insertps", 125, src, dst);
      DO_imm_mandr_r("insertps", 126, src, dst);
      DO_imm_mandr_r("insertps", 127, src, dst);
      DO_imm_mandr_r("insertps", 128, src, dst);
      DO_imm_mandr_r("insertps", 129, src, dst);
      DO_imm_mandr_r("insertps", 130, src, dst);
      DO_imm_mandr_r("insertps", 131, src, dst);
      DO_imm_mandr_r("insertps", 132, src, dst);
      DO_imm_mandr_r("insertps", 133, src, dst);
      DO_imm_mandr_r("insertps", 134, src, dst);
      DO_imm_mandr_r("insertps", 135, src, dst);
      DO_imm_mandr_r("insertps", 136, src, dst);
      DO_imm_mandr_r("insertps", 137, src, dst);
      DO_imm_mandr_r("insertps", 138, src, dst);
      DO_imm_mandr_r("insertps", 139, src, dst);
      DO_imm_mandr_r("insertps", 140, src, dst);
      DO_imm_mandr_r("insertps", 141, src, dst);
      DO_imm_mandr_r("insertps", 142, src, dst);
      DO_imm_mandr_r("insertps", 143, src, dst);
      DO_imm_mandr_r("insertps", 144, src, dst);
      DO_imm_mandr_r("insertps", 145, src, dst);
      DO_imm_mandr_r("insertps", 146, src, dst);
      DO_imm_mandr_r("insertps", 147, src, dst);
      DO_imm_mandr_r("insertps", 148, src, dst);
      DO_imm_mandr_r("insertps", 149, src, dst);
      DO_imm_mandr_r("insertps", 150, src, dst);
      DO_imm_mandr_r("insertps", 151, src, dst);
      DO_imm_mandr_r("insertps", 152, src, dst);
      DO_imm_mandr_r("insertps", 153, src, dst);
      DO_imm_mandr_r("insertps", 154, src, dst);
      DO_imm_mandr_r("insertps", 155, src, dst);
      DO_imm_mandr_r("insertps", 156, src, dst);
      DO_imm_mandr_r("insertps", 157, src, dst);
      DO_imm_mandr_r("insertps", 158, src, dst);
      DO_imm_mandr_r("insertps", 159, src, dst);
      DO_imm_mandr_r("insertps", 160, src, dst);
      DO_imm_mandr_r("insertps", 161, src, dst);
      DO_imm_mandr_r("insertps", 162, src, dst);
      DO_imm_mandr_r("insertps", 163, src, dst);
      DO_imm_mandr_r("insertps", 164, src, dst);
      DO_imm_mandr_r("insertps", 165, src, dst);
      DO_imm_mandr_r("insertps", 166, src, dst);
      DO_imm_mandr_r("insertps", 167, src, dst);
      DO_imm_mandr_r("insertps", 168, src, dst);
      DO_imm_mandr_r("insertps", 169, src, dst);
      DO_imm_mandr_r("insertps", 170, src, dst);
      DO_imm_mandr_r("insertps", 171, src, dst);
      DO_imm_mandr_r("insertps", 172, src, dst);
      DO_imm_mandr_r("insertps", 173, src, dst);
      DO_imm_mandr_r("insertps", 174, src, dst);
      DO_imm_mandr_r("insertps", 175, src, dst);
      DO_imm_mandr_r("insertps", 176, src, dst);
      DO_imm_mandr_r("insertps", 177, src, dst);
      DO_imm_mandr_r("insertps", 178, src, dst);
      DO_imm_mandr_r("insertps", 179, src, dst);
      DO_imm_mandr_r("insertps", 180, src, dst);
      DO_imm_mandr_r("insertps", 181, src, dst);
      DO_imm_mandr_r("insertps", 182, src, dst);
      DO_imm_mandr_r("insertps", 183, src, dst);
      DO_imm_mandr_r("insertps", 184, src, dst);
      DO_imm_mandr_r("insertps", 185, src, dst);
      DO_imm_mandr_r("insertps", 186, src, dst);
      DO_imm_mandr_r("insertps", 187, src, dst);
      DO_imm_mandr_r("insertps", 188, src, dst);
      DO_imm_mandr_r("insertps", 189, src, dst);
      DO_imm_mandr_r("insertps", 190, src, dst);
      DO_imm_mandr_r("insertps", 191, src, dst);
      DO_imm_mandr_r("insertps", 192, src, dst);
      DO_imm_mandr_r("insertps", 193, src, dst);
      DO_imm_mandr_r("insertps", 194, src, dst);
      DO_imm_mandr_r("insertps", 195, src, dst);
      DO_imm_mandr_r("insertps", 196, src, dst);
      DO_imm_mandr_r("insertps", 197, src, dst);
      DO_imm_mandr_r("insertps", 198, src, dst);
      DO_imm_mandr_r("insertps", 199, src, dst);
      DO_imm_mandr_r("insertps", 200, src, dst);
      DO_imm_mandr_r("insertps", 201, src, dst);
      DO_imm_mandr_r("insertps", 202, src, dst);
      DO_imm_mandr_r("insertps", 203, src, dst);
      DO_imm_mandr_r("insertps", 204, src, dst);
      DO_imm_mandr_r("insertps", 205, src, dst);
      DO_imm_mandr_r("insertps", 206, src, dst);
      DO_imm_mandr_r("insertps", 207, src, dst);
      DO_imm_mandr_r("insertps", 208, src, dst);
      DO_imm_mandr_r("insertps", 209, src, dst);
      DO_imm_mandr_r("insertps", 210, src, dst);
      DO_imm_mandr_r("insertps", 211, src, dst);
      DO_imm_mandr_r("insertps", 212, src, dst);
      DO_imm_mandr_r("insertps", 213, src, dst);
      DO_imm_mandr_r("insertps", 214, src, dst);
      DO_imm_mandr_r("insertps", 215, src, dst);
      DO_imm_mandr_r("insertps", 216, src, dst);
      DO_imm_mandr_r("insertps", 217, src, dst);
      DO_imm_mandr_r("insertps", 218, src, dst);
      DO_imm_mandr_r("insertps", 219, src, dst);
      DO_imm_mandr_r("insertps", 220, src, dst);
      DO_imm_mandr_r("insertps", 221, src, dst);
      DO_imm_mandr_r("insertps", 222, src, dst);
      DO_imm_mandr_r("insertps", 223, src, dst);
      DO_imm_mandr_r("insertps", 224, src, dst);
      DO_imm_mandr_r("insertps", 225, src, dst);
      DO_imm_mandr_r("insertps", 226, src, dst);
      DO_imm_mandr_r("insertps", 227, src, dst);
      DO_imm_mandr_r("insertps", 228, src, dst);
      DO_imm_mandr_r("insertps", 229, src, dst);
      DO_imm_mandr_r("insertps", 230, src, dst);
      DO_imm_mandr_r("insertps", 231, src, dst);
      DO_imm_mandr_r("insertps", 232, src, dst);
      DO_imm_mandr_r("insertps", 233, src, dst);
      DO_imm_mandr_r("insertps", 234, src, dst);
      DO_imm_mandr_r("insertps", 235, src, dst);
      DO_imm_mandr_r("insertps", 236, src, dst);
      DO_imm_mandr_r("insertps", 237, src, dst);
      DO_imm_mandr_r("insertps", 238, src, dst);
      DO_imm_mandr_r("insertps", 239, src, dst);
      DO_imm_mandr_r("insertps", 240, src, dst);
      DO_imm_mandr_r("insertps", 241, src, dst);
      DO_imm_mandr_r("insertps", 242, src, dst);
      DO_imm_mandr_r("insertps", 243, src, dst);
      DO_imm_mandr_r("insertps", 244, src, dst);
      DO_imm_mandr_r("insertps", 245, src, dst);
      DO_imm_mandr_r("insertps", 246, src, dst);
      DO_imm_mandr_r("insertps", 247, src, dst);
      DO_imm_mandr_r("insertps", 248, src, dst);
      DO_imm_mandr_r("insertps", 249, src, dst);
      DO_imm_mandr_r("insertps", 250, src, dst);
      DO_imm_mandr_r("insertps", 251, src, dst);
      DO_imm_mandr_r("insertps", 252, src, dst);
      DO_imm_mandr_r("insertps", 253, src, dst);
      DO_imm_mandr_r("insertps", 254, src, dst);
      DO_imm_mandr_r("insertps", 255, src, dst);
   }
}

/* Run "mpsadbw" through DO_imm_mandr_r on 50 freshly randomised operand
   pairs, trying the immediates 0..7 on each pair.

   NOTE(review): only 0..7 are tried, presumably because just the low
   three immediate bits matter to the instruction -- confirm against the
   instruction reference. */
void test_MPSADBW ( void )
{
   V128 src, dst;
   Int i;
   for (i = 0; i < 50; i++) {
      /* src is drawn before dst; both come from the same generator, so
         the order determines the operand values. */
      randV128(&src);
      randV128(&dst);
      DO_imm_mandr_r("mpsadbw", 0, src, dst);
      DO_imm_mandr_r("mpsadbw", 1, src, dst);
      DO_imm_mandr_r("mpsadbw", 2, src, dst);
      DO_imm_mandr_r("mpsadbw", 3, src, dst);
      DO_imm_mandr_r("mpsadbw", 4, src, dst);
      DO_imm_mandr_r("mpsadbw", 5, src, dst);
      DO_imm_mandr_r("mpsadbw", 6, src, dst);
      DO_imm_mandr_r("mpsadbw", 7, src, dst);
   }
}

void test_PACKUSDW ( void ) 1261 { 1262 V128 src, dst; 1263 Int i; 1264 for (i = 0; i < 10; i++) { 1265 if (i < 9) { 1266 randV128(&src); 1267 randV128(&dst); 1268 } else { 1269 memset(&src, 0, sizeof(src)); 1270 memset(&dst, 0, sizeof(src)); 1271 src[0] = 0x11; src[1] = 0x22; 1272 src[4] = 0x33; src[5] = 0x44; 1273 src[8] = 0x55; src[9] = 0x66; 1274 src[12] = 0x77; src[13] = 0x88; 1275 dst[0] = 0xaa; dst[1] = 0xbb; 1276 dst[4] = 0xcc; dst[5] = 0xdd; 1277 dst[8] = 0xee; dst[9] = 0xff; 1278 dst[12] = 0xa1; dst[13] = 0xb2; 1279 } 1280 DO_mandr_r("packusdw", src, dst); 1281 } 1282 } 1283 1284 void test_PBLENDW ( void ) 1285 { 1286 V128 src, dst; 1287 randV128(&src); 1288 randV128(&dst); 1289 { 1290 DO_imm_mandr_r("pblendw", 0, src, dst); 1291 DO_imm_mandr_r("pblendw", 1, src, dst); 1292 DO_imm_mandr_r("pblendw", 2, src, dst); 1293 DO_imm_mandr_r("pblendw", 3, src, dst); 1294 DO_imm_mandr_r("pblendw", 4, src, dst); 1295 DO_imm_mandr_r("pblendw", 5, src, dst); 1296 DO_imm_mandr_r("pblendw", 6, src, dst); 1297 DO_imm_mandr_r("pblendw", 7, src, dst); 1298 DO_imm_mandr_r("pblendw", 8, src, dst); 1299 DO_imm_mandr_r("pblendw", 9, src, dst); 1300 DO_imm_mandr_r("pblendw", 10, src, dst); 1301 DO_imm_mandr_r("pblendw", 11, src, dst); 1302 DO_imm_mandr_r("pblendw", 12, src, dst); 1303 DO_imm_mandr_r("pblendw", 13, src, dst); 1304 DO_imm_mandr_r("pblendw", 14, src, dst); 1305 DO_imm_mandr_r("pblendw", 15, src, dst); 1306 DO_imm_mandr_r("pblendw", 16, src, dst); 1307 DO_imm_mandr_r("pblendw", 17, src, dst); 1308 DO_imm_mandr_r("pblendw", 18, src, dst); 1309 DO_imm_mandr_r("pblendw", 19, src, dst); 1310 DO_imm_mandr_r("pblendw", 20, src, dst); 1311 DO_imm_mandr_r("pblendw", 21, src, dst); 1312 DO_imm_mandr_r("pblendw", 22, src, dst); 1313 DO_imm_mandr_r("pblendw", 23, src, dst); 1314 DO_imm_mandr_r("pblendw", 24, src, dst); 1315 DO_imm_mandr_r("pblendw", 25, src, dst); 1316 DO_imm_mandr_r("pblendw", 26, src, dst); 1317 DO_imm_mandr_r("pblendw", 27, src, dst); 1318 
DO_imm_mandr_r("pblendw", 28, src, dst); 1319 DO_imm_mandr_r("pblendw", 29, src, dst); 1320 DO_imm_mandr_r("pblendw", 30, src, dst); 1321 DO_imm_mandr_r("pblendw", 31, src, dst); 1322 DO_imm_mandr_r("pblendw", 32, src, dst); 1323 DO_imm_mandr_r("pblendw", 33, src, dst); 1324 DO_imm_mandr_r("pblendw", 34, src, dst); 1325 DO_imm_mandr_r("pblendw", 35, src, dst); 1326 DO_imm_mandr_r("pblendw", 36, src, dst); 1327 DO_imm_mandr_r("pblendw", 37, src, dst); 1328 DO_imm_mandr_r("pblendw", 38, src, dst); 1329 DO_imm_mandr_r("pblendw", 39, src, dst); 1330 DO_imm_mandr_r("pblendw", 40, src, dst); 1331 DO_imm_mandr_r("pblendw", 41, src, dst); 1332 DO_imm_mandr_r("pblendw", 42, src, dst); 1333 DO_imm_mandr_r("pblendw", 43, src, dst); 1334 DO_imm_mandr_r("pblendw", 44, src, dst); 1335 DO_imm_mandr_r("pblendw", 45, src, dst); 1336 DO_imm_mandr_r("pblendw", 46, src, dst); 1337 DO_imm_mandr_r("pblendw", 47, src, dst); 1338 DO_imm_mandr_r("pblendw", 48, src, dst); 1339 DO_imm_mandr_r("pblendw", 49, src, dst); 1340 DO_imm_mandr_r("pblendw", 50, src, dst); 1341 DO_imm_mandr_r("pblendw", 51, src, dst); 1342 DO_imm_mandr_r("pblendw", 52, src, dst); 1343 DO_imm_mandr_r("pblendw", 53, src, dst); 1344 DO_imm_mandr_r("pblendw", 54, src, dst); 1345 DO_imm_mandr_r("pblendw", 55, src, dst); 1346 DO_imm_mandr_r("pblendw", 56, src, dst); 1347 DO_imm_mandr_r("pblendw", 57, src, dst); 1348 DO_imm_mandr_r("pblendw", 58, src, dst); 1349 DO_imm_mandr_r("pblendw", 59, src, dst); 1350 DO_imm_mandr_r("pblendw", 60, src, dst); 1351 DO_imm_mandr_r("pblendw", 61, src, dst); 1352 DO_imm_mandr_r("pblendw", 62, src, dst); 1353 DO_imm_mandr_r("pblendw", 63, src, dst); 1354 DO_imm_mandr_r("pblendw", 64, src, dst); 1355 DO_imm_mandr_r("pblendw", 65, src, dst); 1356 DO_imm_mandr_r("pblendw", 66, src, dst); 1357 DO_imm_mandr_r("pblendw", 67, src, dst); 1358 DO_imm_mandr_r("pblendw", 68, src, dst); 1359 DO_imm_mandr_r("pblendw", 69, src, dst); 1360 DO_imm_mandr_r("pblendw", 70, src, dst); 1361 
DO_imm_mandr_r("pblendw", 71, src, dst); 1362 DO_imm_mandr_r("pblendw", 72, src, dst); 1363 DO_imm_mandr_r("pblendw", 73, src, dst); 1364 DO_imm_mandr_r("pblendw", 74, src, dst); 1365 DO_imm_mandr_r("pblendw", 75, src, dst); 1366 DO_imm_mandr_r("pblendw", 76, src, dst); 1367 DO_imm_mandr_r("pblendw", 77, src, dst); 1368 DO_imm_mandr_r("pblendw", 78, src, dst); 1369 DO_imm_mandr_r("pblendw", 79, src, dst); 1370 DO_imm_mandr_r("pblendw", 80, src, dst); 1371 DO_imm_mandr_r("pblendw", 81, src, dst); 1372 DO_imm_mandr_r("pblendw", 82, src, dst); 1373 DO_imm_mandr_r("pblendw", 83, src, dst); 1374 DO_imm_mandr_r("pblendw", 84, src, dst); 1375 DO_imm_mandr_r("pblendw", 85, src, dst); 1376 DO_imm_mandr_r("pblendw", 86, src, dst); 1377 DO_imm_mandr_r("pblendw", 87, src, dst); 1378 DO_imm_mandr_r("pblendw", 88, src, dst); 1379 DO_imm_mandr_r("pblendw", 89, src, dst); 1380 DO_imm_mandr_r("pblendw", 90, src, dst); 1381 DO_imm_mandr_r("pblendw", 91, src, dst); 1382 DO_imm_mandr_r("pblendw", 92, src, dst); 1383 DO_imm_mandr_r("pblendw", 93, src, dst); 1384 DO_imm_mandr_r("pblendw", 94, src, dst); 1385 DO_imm_mandr_r("pblendw", 95, src, dst); 1386 DO_imm_mandr_r("pblendw", 96, src, dst); 1387 DO_imm_mandr_r("pblendw", 97, src, dst); 1388 DO_imm_mandr_r("pblendw", 98, src, dst); 1389 DO_imm_mandr_r("pblendw", 99, src, dst); 1390 DO_imm_mandr_r("pblendw", 100, src, dst); 1391 DO_imm_mandr_r("pblendw", 101, src, dst); 1392 DO_imm_mandr_r("pblendw", 102, src, dst); 1393 DO_imm_mandr_r("pblendw", 103, src, dst); 1394 DO_imm_mandr_r("pblendw", 104, src, dst); 1395 DO_imm_mandr_r("pblendw", 105, src, dst); 1396 DO_imm_mandr_r("pblendw", 106, src, dst); 1397 DO_imm_mandr_r("pblendw", 107, src, dst); 1398 DO_imm_mandr_r("pblendw", 108, src, dst); 1399 DO_imm_mandr_r("pblendw", 109, src, dst); 1400 DO_imm_mandr_r("pblendw", 110, src, dst); 1401 DO_imm_mandr_r("pblendw", 111, src, dst); 1402 DO_imm_mandr_r("pblendw", 112, src, dst); 1403 DO_imm_mandr_r("pblendw", 113, src, dst); 1404 
DO_imm_mandr_r("pblendw", 114, src, dst); 1405 DO_imm_mandr_r("pblendw", 115, src, dst); 1406 DO_imm_mandr_r("pblendw", 116, src, dst); 1407 DO_imm_mandr_r("pblendw", 117, src, dst); 1408 DO_imm_mandr_r("pblendw", 118, src, dst); 1409 DO_imm_mandr_r("pblendw", 119, src, dst); 1410 DO_imm_mandr_r("pblendw", 120, src, dst); 1411 DO_imm_mandr_r("pblendw", 121, src, dst); 1412 DO_imm_mandr_r("pblendw", 122, src, dst); 1413 DO_imm_mandr_r("pblendw", 123, src, dst); 1414 DO_imm_mandr_r("pblendw", 124, src, dst); 1415 DO_imm_mandr_r("pblendw", 125, src, dst); 1416 DO_imm_mandr_r("pblendw", 126, src, dst); 1417 DO_imm_mandr_r("pblendw", 127, src, dst); 1418 DO_imm_mandr_r("pblendw", 128, src, dst); 1419 DO_imm_mandr_r("pblendw", 129, src, dst); 1420 DO_imm_mandr_r("pblendw", 130, src, dst); 1421 DO_imm_mandr_r("pblendw", 131, src, dst); 1422 DO_imm_mandr_r("pblendw", 132, src, dst); 1423 DO_imm_mandr_r("pblendw", 133, src, dst); 1424 DO_imm_mandr_r("pblendw", 134, src, dst); 1425 DO_imm_mandr_r("pblendw", 135, src, dst); 1426 DO_imm_mandr_r("pblendw", 136, src, dst); 1427 DO_imm_mandr_r("pblendw", 137, src, dst); 1428 DO_imm_mandr_r("pblendw", 138, src, dst); 1429 DO_imm_mandr_r("pblendw", 139, src, dst); 1430 DO_imm_mandr_r("pblendw", 140, src, dst); 1431 DO_imm_mandr_r("pblendw", 141, src, dst); 1432 DO_imm_mandr_r("pblendw", 142, src, dst); 1433 DO_imm_mandr_r("pblendw", 143, src, dst); 1434 DO_imm_mandr_r("pblendw", 144, src, dst); 1435 DO_imm_mandr_r("pblendw", 145, src, dst); 1436 DO_imm_mandr_r("pblendw", 146, src, dst); 1437 DO_imm_mandr_r("pblendw", 147, src, dst); 1438 DO_imm_mandr_r("pblendw", 148, src, dst); 1439 DO_imm_mandr_r("pblendw", 149, src, dst); 1440 DO_imm_mandr_r("pblendw", 150, src, dst); 1441 DO_imm_mandr_r("pblendw", 151, src, dst); 1442 DO_imm_mandr_r("pblendw", 152, src, dst); 1443 DO_imm_mandr_r("pblendw", 153, src, dst); 1444 DO_imm_mandr_r("pblendw", 154, src, dst); 1445 DO_imm_mandr_r("pblendw", 155, src, dst); 1446 DO_imm_mandr_r("pblendw", 
156, src, dst); 1447 DO_imm_mandr_r("pblendw", 157, src, dst); 1448 DO_imm_mandr_r("pblendw", 158, src, dst); 1449 DO_imm_mandr_r("pblendw", 159, src, dst); 1450 DO_imm_mandr_r("pblendw", 160, src, dst); 1451 DO_imm_mandr_r("pblendw", 161, src, dst); 1452 DO_imm_mandr_r("pblendw", 162, src, dst); 1453 DO_imm_mandr_r("pblendw", 163, src, dst); 1454 DO_imm_mandr_r("pblendw", 164, src, dst); 1455 DO_imm_mandr_r("pblendw", 165, src, dst); 1456 DO_imm_mandr_r("pblendw", 166, src, dst); 1457 DO_imm_mandr_r("pblendw", 167, src, dst); 1458 DO_imm_mandr_r("pblendw", 168, src, dst); 1459 DO_imm_mandr_r("pblendw", 169, src, dst); 1460 DO_imm_mandr_r("pblendw", 170, src, dst); 1461 DO_imm_mandr_r("pblendw", 171, src, dst); 1462 DO_imm_mandr_r("pblendw", 172, src, dst); 1463 DO_imm_mandr_r("pblendw", 173, src, dst); 1464 DO_imm_mandr_r("pblendw", 174, src, dst); 1465 DO_imm_mandr_r("pblendw", 175, src, dst); 1466 DO_imm_mandr_r("pblendw", 176, src, dst); 1467 DO_imm_mandr_r("pblendw", 177, src, dst); 1468 DO_imm_mandr_r("pblendw", 178, src, dst); 1469 DO_imm_mandr_r("pblendw", 179, src, dst); 1470 DO_imm_mandr_r("pblendw", 180, src, dst); 1471 DO_imm_mandr_r("pblendw", 181, src, dst); 1472 DO_imm_mandr_r("pblendw", 182, src, dst); 1473 DO_imm_mandr_r("pblendw", 183, src, dst); 1474 DO_imm_mandr_r("pblendw", 184, src, dst); 1475 DO_imm_mandr_r("pblendw", 185, src, dst); 1476 DO_imm_mandr_r("pblendw", 186, src, dst); 1477 DO_imm_mandr_r("pblendw", 187, src, dst); 1478 DO_imm_mandr_r("pblendw", 188, src, dst); 1479 DO_imm_mandr_r("pblendw", 189, src, dst); 1480 DO_imm_mandr_r("pblendw", 190, src, dst); 1481 DO_imm_mandr_r("pblendw", 191, src, dst); 1482 DO_imm_mandr_r("pblendw", 192, src, dst); 1483 DO_imm_mandr_r("pblendw", 193, src, dst); 1484 DO_imm_mandr_r("pblendw", 194, src, dst); 1485 DO_imm_mandr_r("pblendw", 195, src, dst); 1486 DO_imm_mandr_r("pblendw", 196, src, dst); 1487 DO_imm_mandr_r("pblendw", 197, src, dst); 1488 DO_imm_mandr_r("pblendw", 198, src, dst); 1489 
DO_imm_mandr_r("pblendw", 199, src, dst); 1490 DO_imm_mandr_r("pblendw", 200, src, dst); 1491 DO_imm_mandr_r("pblendw", 201, src, dst); 1492 DO_imm_mandr_r("pblendw", 202, src, dst); 1493 DO_imm_mandr_r("pblendw", 203, src, dst); 1494 DO_imm_mandr_r("pblendw", 204, src, dst); 1495 DO_imm_mandr_r("pblendw", 205, src, dst); 1496 DO_imm_mandr_r("pblendw", 206, src, dst); 1497 DO_imm_mandr_r("pblendw", 207, src, dst); 1498 DO_imm_mandr_r("pblendw", 208, src, dst); 1499 DO_imm_mandr_r("pblendw", 209, src, dst); 1500 DO_imm_mandr_r("pblendw", 210, src, dst); 1501 DO_imm_mandr_r("pblendw", 211, src, dst); 1502 DO_imm_mandr_r("pblendw", 212, src, dst); 1503 DO_imm_mandr_r("pblendw", 213, src, dst); 1504 DO_imm_mandr_r("pblendw", 214, src, dst); 1505 DO_imm_mandr_r("pblendw", 215, src, dst); 1506 DO_imm_mandr_r("pblendw", 216, src, dst); 1507 DO_imm_mandr_r("pblendw", 217, src, dst); 1508 DO_imm_mandr_r("pblendw", 218, src, dst); 1509 DO_imm_mandr_r("pblendw", 219, src, dst); 1510 DO_imm_mandr_r("pblendw", 220, src, dst); 1511 DO_imm_mandr_r("pblendw", 221, src, dst); 1512 DO_imm_mandr_r("pblendw", 222, src, dst); 1513 DO_imm_mandr_r("pblendw", 223, src, dst); 1514 DO_imm_mandr_r("pblendw", 224, src, dst); 1515 DO_imm_mandr_r("pblendw", 225, src, dst); 1516 DO_imm_mandr_r("pblendw", 226, src, dst); 1517 DO_imm_mandr_r("pblendw", 227, src, dst); 1518 DO_imm_mandr_r("pblendw", 228, src, dst); 1519 DO_imm_mandr_r("pblendw", 229, src, dst); 1520 DO_imm_mandr_r("pblendw", 230, src, dst); 1521 DO_imm_mandr_r("pblendw", 231, src, dst); 1522 DO_imm_mandr_r("pblendw", 232, src, dst); 1523 DO_imm_mandr_r("pblendw", 233, src, dst); 1524 DO_imm_mandr_r("pblendw", 234, src, dst); 1525 DO_imm_mandr_r("pblendw", 235, src, dst); 1526 DO_imm_mandr_r("pblendw", 236, src, dst); 1527 DO_imm_mandr_r("pblendw", 237, src, dst); 1528 DO_imm_mandr_r("pblendw", 238, src, dst); 1529 DO_imm_mandr_r("pblendw", 239, src, dst); 1530 DO_imm_mandr_r("pblendw", 240, src, dst); 1531 DO_imm_mandr_r("pblendw", 
241, src, dst); 1532 DO_imm_mandr_r("pblendw", 242, src, dst); 1533 DO_imm_mandr_r("pblendw", 243, src, dst); 1534 DO_imm_mandr_r("pblendw", 244, src, dst); 1535 DO_imm_mandr_r("pblendw", 245, src, dst); 1536 DO_imm_mandr_r("pblendw", 246, src, dst); 1537 DO_imm_mandr_r("pblendw", 247, src, dst); 1538 DO_imm_mandr_r("pblendw", 248, src, dst); 1539 DO_imm_mandr_r("pblendw", 249, src, dst); 1540 DO_imm_mandr_r("pblendw", 250, src, dst); 1541 DO_imm_mandr_r("pblendw", 251, src, dst); 1542 DO_imm_mandr_r("pblendw", 252, src, dst); 1543 DO_imm_mandr_r("pblendw", 253, src, dst); 1544 DO_imm_mandr_r("pblendw", 254, src, dst); 1545 DO_imm_mandr_r("pblendw", 255, src, dst); 1546 } 1547 } 1548 1549 1550 void test_PCMPEQQ ( void ) 1551 { 1552 V128 src, dst; 1553 Int i; 1554 for (i = 0; i < 10; i++) { 1555 randV128(&src); 1556 randV128(&dst); 1557 switch (i - 6) { 1558 case 0: memset(&src[0], 0x55, 8); 1559 memset(&dst[0], 0x55, 8); break; 1560 case 1: memset(&src[8], 0x55, 8); 1561 memset(&dst[8], 0x55, 8); break; 1562 default: 1563 break; 1564 } 1565 DO_mandr_r("pcmpeqq", src, dst); 1566 } 1567 } 1568 1569 1570 void test_PEXTRB ( void ) 1571 { 1572 V128 src; 1573 randV128(&src); 1574 DO_imm_r_to_mandrscalar("pextrb", 0, src, "d"); 1575 DO_imm_r_to_mandrscalar("pextrb", 1, src, "d"); 1576 DO_imm_r_to_mandrscalar("pextrb", 2, src, "d"); 1577 DO_imm_r_to_mandrscalar("pextrb", 3, src, "d"); 1578 DO_imm_r_to_mandrscalar("pextrb", 4, src, "d"); 1579 DO_imm_r_to_mandrscalar("pextrb", 5, src, "d"); 1580 DO_imm_r_to_mandrscalar("pextrb", 6, src, "d"); 1581 DO_imm_r_to_mandrscalar("pextrb", 7, src, "d"); 1582 DO_imm_r_to_mandrscalar("pextrb", 8, src, "d"); 1583 DO_imm_r_to_mandrscalar("pextrb", 9, src, "d"); 1584 DO_imm_r_to_mandrscalar("pextrb", 10, src, "d"); 1585 DO_imm_r_to_mandrscalar("pextrb", 11, src, "d"); 1586 DO_imm_r_to_mandrscalar("pextrb", 12, src, "d"); 1587 DO_imm_r_to_mandrscalar("pextrb", 13, src, "d"); 1588 DO_imm_r_to_mandrscalar("pextrb", 14, src, "d"); 1589 
DO_imm_r_to_mandrscalar("pextrb", 15, src, "d"); 1590 } 1591 1592 void test_PINSRB ( void ) 1593 { 1594 ULong src; 1595 src = randULong(); 1596 DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d"); 1597 src = randULong(); 1598 DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d"); 1599 src = randULong(); 1600 DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d"); 1601 src = randULong(); 1602 DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d"); 1603 src = randULong(); 1604 DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d"); 1605 src = randULong(); 1606 DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d"); 1607 src = randULong(); 1608 DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d"); 1609 src = randULong(); 1610 DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d"); 1611 src = randULong(); 1612 DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d"); 1613 src = randULong(); 1614 DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d"); 1615 src = randULong(); 1616 DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d"); 1617 src = randULong(); 1618 DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d"); 1619 src = randULong(); 1620 DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d"); 1621 src = randULong(); 1622 DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d"); 1623 src = randULong(); 1624 DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d"); 1625 src = randULong(); 1626 DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d"); 1627 } 1628 1629 1630 void test_PEXTRW ( void ) 1631 { 1632 V128 src; 1633 randV128(&src); 1634 DO_imm_r_to_mandrscalar("pextrw", 0, src, "d"); 1635 DO_imm_r_to_mandrscalar("pextrw", 1, src, "d"); 1636 DO_imm_r_to_mandrscalar("pextrw", 2, src, "d"); 1637 DO_imm_r_to_mandrscalar("pextrw", 3, src, "d"); 1638 DO_imm_r_to_mandrscalar("pextrw", 4, src, "d"); 1639 DO_imm_r_to_mandrscalar("pextrw", 5, src, "d"); 1640 DO_imm_r_to_mandrscalar("pextrw", 6, src, "d"); 1641 DO_imm_r_to_mandrscalar("pextrw", 7, src, "d"); 1642 } 1643 1644 void test_PINSRW ( void ) 1645 { 1646 ULong src; 1647 src = randULong(); 1648 DO_imm_mandrscalar_to_r("pinsrw", 0, 
src, "d"); 1649 src = randULong(); 1650 DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d"); 1651 src = randULong(); 1652 DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d"); 1653 src = randULong(); 1654 DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d"); 1655 src = randULong(); 1656 DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d"); 1657 src = randULong(); 1658 DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d"); 1659 src = randULong(); 1660 DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d"); 1661 src = randULong(); 1662 DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d"); 1663 } 1664 1665 1666 void test_PEXTRD ( void ) 1667 { 1668 V128 src; 1669 randV128(&src); 1670 DO_imm_r_to_mandrscalar("pextrd", 0, src, "d"); 1671 DO_imm_r_to_mandrscalar("pextrd", 1, src, "d"); 1672 DO_imm_r_to_mandrscalar("pextrd", 2, src, "d"); 1673 DO_imm_r_to_mandrscalar("pextrd", 3, src, "d"); 1674 } 1675 1676 void test_PINSRD ( void ) 1677 { 1678 ULong src; 1679 src = randULong(); 1680 DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d"); 1681 src = randULong(); 1682 DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d"); 1683 src = randULong(); 1684 DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d"); 1685 src = randULong(); 1686 DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d"); 1687 } 1688 1689 1690 void test_PEXTRQ ( void ) 1691 { 1692 V128 src; 1693 randV128(&src); 1694 DO_imm_r_to_mandrscalar("pextrq", 0, src, ""); 1695 DO_imm_r_to_mandrscalar("pextrq", 1, src, ""); 1696 } 1697 1698 void test_PINSRQ ( void ) 1699 { 1700 ULong src; 1701 src = randULong(); 1702 DO_imm_mandrscalar_to_r("pinsrq", 0, src, ""); 1703 src = randULong(); 1704 DO_imm_mandrscalar_to_r("pinsrq", 1, src, ""); 1705 } 1706 1707 1708 void test_EXTRACTPS ( void ) 1709 { 1710 V128 src; 1711 randV128(&src); 1712 DO_imm_r_to_mandrscalar("extractps", 0, src, "d"); 1713 DO_imm_r_to_mandrscalar("extractps", 1, src, "d"); 1714 DO_imm_r_to_mandrscalar("extractps", 2, src, "d"); 1715 DO_imm_r_to_mandrscalar("extractps", 3, src, "d"); 1716 } 1717 1718 1719 void 
test_PHMINPOSUW ( void ) 1720 { 1721 V128 src, dst; 1722 Int i; 1723 for (i = 0; i < 20; i++) { 1724 randV128(&src); 1725 randV128(&dst); 1726 DO_mandr_r("phminposuw", src, dst); 1727 } 1728 memset(src, 0x55, sizeof(src)); 1729 memset(dst, 0xAA, sizeof(dst)); 1730 DO_mandr_r("phminposuw", src, dst); 1731 } 1732 1733 void test_PMAXSB ( void ) 1734 { 1735 V128 src, dst; 1736 Int i; 1737 for (i = 0; i < 10; i++) { 1738 randV128(&src); 1739 randV128(&dst); 1740 DO_mandr_r("pmaxsb", src, dst); 1741 } 1742 } 1743 1744 void test_PMAXSD ( void ) 1745 { 1746 V128 src, dst; 1747 Int i; 1748 for (i = 0; i < 10; i++) { 1749 randV128(&src); 1750 randV128(&dst); 1751 DO_mandr_r("pmaxsd", src, dst); 1752 } 1753 } 1754 1755 void test_PMAXUD ( void ) 1756 { 1757 V128 src, dst; 1758 Int i; 1759 for (i = 0; i < 10; i++) { 1760 randV128(&src); 1761 randV128(&dst); 1762 DO_mandr_r("pmaxud", src, dst); 1763 } 1764 } 1765 1766 void test_PMAXUW ( void ) 1767 { 1768 V128 src, dst; 1769 Int i; 1770 for (i = 0; i < 10; i++) { 1771 randV128(&src); 1772 randV128(&dst); 1773 DO_mandr_r("pmaxuw", src, dst); 1774 } 1775 } 1776 1777 void test_PMINSB ( void ) 1778 { 1779 V128 src, dst; 1780 Int i; 1781 for (i = 0; i < 10; i++) { 1782 randV128(&src); 1783 randV128(&dst); 1784 DO_mandr_r("pminsb", src, dst); 1785 } 1786 } 1787 1788 void test_PMINSD ( void ) 1789 { 1790 V128 src, dst; 1791 Int i; 1792 for (i = 0; i < 10; i++) { 1793 randV128(&src); 1794 randV128(&dst); 1795 DO_mandr_r("pminsd", src, dst); 1796 } 1797 } 1798 1799 void test_PMINUD ( void ) 1800 { 1801 V128 src, dst; 1802 Int i; 1803 for (i = 0; i < 10; i++) { 1804 randV128(&src); 1805 randV128(&dst); 1806 DO_mandr_r("pminud", src, dst); 1807 } 1808 } 1809 1810 void test_PMINUW ( void ) 1811 { 1812 V128 src, dst; 1813 Int i; 1814 for (i = 0; i < 10; i++) { 1815 randV128(&src); 1816 randV128(&dst); 1817 DO_mandr_r("pminuw", src, dst); 1818 } 1819 } 1820 1821 void test_PMOVSXBW ( void ) 1822 { 1823 V128 src, dst; 1824 Int i; 1825 for (i = 
0; i < 10; i++) { 1826 randV128(&src); 1827 randV128(&dst); 1828 DO_mandr_r("pmovsxbw", src, dst); 1829 } 1830 } 1831 1832 void test_PMOVSXBD ( void ) 1833 { 1834 V128 src, dst; 1835 Int i; 1836 for (i = 0; i < 10; i++) { 1837 randV128(&src); 1838 randV128(&dst); 1839 DO_mandr_r("pmovsxbd", src, dst); 1840 } 1841 } 1842 1843 void test_PMOVSXBQ ( void ) 1844 { 1845 V128 src, dst; 1846 Int i; 1847 for (i = 0; i < 10; i++) { 1848 randV128(&src); 1849 randV128(&dst); 1850 DO_mandr_r("pmovsxbq", src, dst); 1851 } 1852 } 1853 1854 void test_PMOVSXWD ( void ) 1855 { 1856 V128 src, dst; 1857 Int i; 1858 for (i = 0; i < 10; i++) { 1859 randV128(&src); 1860 randV128(&dst); 1861 DO_mandr_r("pmovsxwd", src, dst); 1862 } 1863 } 1864 1865 void test_PMOVSXWQ ( void ) 1866 { 1867 V128 src, dst; 1868 Int i; 1869 for (i = 0; i < 10; i++) { 1870 randV128(&src); 1871 randV128(&dst); 1872 DO_mandr_r("pmovsxwq", src, dst); 1873 } 1874 } 1875 1876 void test_PMOVSXDQ ( void ) 1877 { 1878 V128 src, dst; 1879 Int i; 1880 for (i = 0; i < 10; i++) { 1881 randV128(&src); 1882 randV128(&dst); 1883 DO_mandr_r("pmovsxdq", src, dst); 1884 } 1885 } 1886 1887 void test_PMOVZXBW ( void ) 1888 { 1889 V128 src, dst; 1890 Int i; 1891 for (i = 0; i < 10; i++) { 1892 randV128(&src); 1893 randV128(&dst); 1894 DO_mandr_r("pmovzxbw", src, dst); 1895 } 1896 } 1897 1898 void test_PMOVZXBD ( void ) 1899 { 1900 V128 src, dst; 1901 Int i; 1902 for (i = 0; i < 10; i++) { 1903 randV128(&src); 1904 randV128(&dst); 1905 DO_mandr_r("pmovzxbd", src, dst); 1906 } 1907 } 1908 1909 void test_PMOVZXBQ ( void ) 1910 { 1911 V128 src, dst; 1912 Int i; 1913 for (i = 0; i < 10; i++) { 1914 randV128(&src); 1915 randV128(&dst); 1916 DO_mandr_r("pmovzxbq", src, dst); 1917 } 1918 } 1919 1920 void test_PMOVZXWD ( void ) 1921 { 1922 V128 src, dst; 1923 Int i; 1924 for (i = 0; i < 10; i++) { 1925 randV128(&src); 1926 randV128(&dst); 1927 DO_mandr_r("pmovzxwd", src, dst); 1928 } 1929 } 1930 1931 void test_PMOVZXWQ ( void ) 1932 { 1933 
V128 src, dst; 1934 Int i; 1935 for (i = 0; i < 10; i++) { 1936 randV128(&src); 1937 randV128(&dst); 1938 DO_mandr_r("pmovzxwq", src, dst); 1939 } 1940 } 1941 1942 void test_PMOVZXDQ ( void ) 1943 { 1944 V128 src, dst; 1945 Int i; 1946 for (i = 0; i < 10; i++) { 1947 randV128(&src); 1948 randV128(&dst); 1949 DO_mandr_r("pmovzxdq", src, dst); 1950 } 1951 } 1952 1953 void test_PMULDQ ( void ) 1954 { 1955 V128 src, dst; 1956 Int i; 1957 for (i = 0; i < 10; i++) { 1958 randV128(&src); 1959 randV128(&dst); 1960 DO_mandr_r("pmuldq", src, dst); 1961 } 1962 } 1963 1964 1965 void test_PMULLD ( void ) 1966 { 1967 V128 src, dst; 1968 Int i; 1969 for (i = 0; i < 10; i++) { 1970 randV128(&src); 1971 randV128(&dst); 1972 DO_mandr_r("pmulld", src, dst); 1973 } 1974 } 1975 1976 1977 void test_POPCNTQ ( void ) 1978 { 1979 ULong block[4]; 1980 Int i; 1981 ULong oszacp_mask = 0x8D5; 1982 for (i = 0; i < 10; i++) { 1983 block[0] = i == 0 ? 0 : randULong(); 1984 block[1] = randULong(); 1985 block[2] = randULong(); 1986 block[3] = randULong(); 1987 __asm__ __volatile__( 1988 "movq %0, %%rax" "\n\t" 1989 "movq 0(%%rax), %%rdi" "\n\t" 1990 "movq 8(%%rax), %%r11" "\n\t" 1991 #ifndef VGP_amd64_darwin 1992 "popcntq %%rdi, %%r11" "\n\t" 1993 #else 1994 "popcnt %%rdi, %%r11" "\n\t" 1995 #endif 1996 "movq %%r11, 16(%%rax)" "\n\t" 1997 "pushfq" "\n\t" 1998 "popq %%r12" "\n\t" 1999 "movq %%r12, 24(%%rax)" "\n" 2000 : /*out*/ 2001 : /*in*/"r"(&block[0]) 2002 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2003 ); 2004 printf("r popcntq %016llx %016llx %016llx %016llx\n", 2005 block[0], block[1], block[2], block[3] & oszacp_mask); 2006 2007 block[0] = i == 0 ? 
0 : randULong(); 2008 block[1] = randULong(); 2009 block[2] = randULong(); 2010 block[3] = randULong(); 2011 __asm__ __volatile__( 2012 "movq %0, %%rax" "\n\t" 2013 "movq 8(%%rax), %%r11" "\n\t" 2014 #ifndef VGP_amd64_darwin 2015 "popcntq 0(%%rax), %%r11" "\n\t" 2016 #else 2017 "popcnt 0(%%rax), %%r11" "\n\t" 2018 #endif 2019 "movq %%r11, 16(%%rax)" "\n\t" 2020 "pushfq" "\n\t" 2021 "popq %%r12" "\n\t" 2022 "movq %%r12, 24(%%rax)" "\n" 2023 : /*out*/ 2024 : /*in*/"r"(&block[0]) 2025 : /*trash*/ "cc", "memory", "r11", "r12" 2026 ); 2027 printf("m popcntq %016llx %016llx %016llx %016llx\n", 2028 block[0], block[1], block[2], block[3] & oszacp_mask); 2029 } 2030 } 2031 2032 2033 void test_POPCNTL ( void ) 2034 { 2035 ULong block[4]; 2036 Int i; 2037 ULong oszacp_mask = 0x8D5; 2038 for (i = 0; i < 10; i++) { 2039 block[0] = i == 0 ? 0 : randULong(); 2040 block[1] = randULong(); 2041 block[2] = randULong(); 2042 block[3] = randULong(); 2043 __asm__ __volatile__( 2044 "movq %0, %%rax" "\n\t" 2045 "movq 0(%%rax), %%rdi" "\n\t" 2046 "movq 8(%%rax), %%r11" "\n\t" 2047 #ifndef VGP_amd64_darwin 2048 "popcntl %%edi, %%r11d" "\n\t" 2049 #else 2050 "popcnt %%edi, %%r11d" "\n\t" 2051 #endif 2052 "movq %%r11, 16(%%rax)" "\n\t" 2053 "pushfq" "\n\t" 2054 "popq %%r12" "\n\t" 2055 "movq %%r12, 24(%%rax)" "\n" 2056 : /*out*/ 2057 : /*in*/"r"(&block[0]) 2058 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2059 ); 2060 printf("r popcntl %016llx %016llx %016llx %016llx\n", 2061 block[0], block[1], block[2], block[3] & oszacp_mask); 2062 2063 block[0] = i == 0 ? 
0 : randULong(); 2064 block[1] = randULong(); 2065 block[2] = randULong(); 2066 block[3] = randULong(); 2067 __asm__ __volatile__( 2068 "movq %0, %%rax" "\n\t" 2069 "movq 8(%%rax), %%r11" "\n\t" 2070 #ifndef VGP_amd64_darwin 2071 "popcntl 0(%%rax), %%r11d" "\n\t" 2072 #else 2073 "popcnt 0(%%rax), %%r11d" "\n\t" 2074 #endif 2075 "movq %%r11, 16(%%rax)" "\n\t" 2076 "pushfq" "\n\t" 2077 "popq %%r12" "\n\t" 2078 "movq %%r12, 24(%%rax)" "\n" 2079 : /*out*/ 2080 : /*in*/"r"(&block[0]) 2081 : /*trash*/ "cc", "memory", "r11", "r12" 2082 ); 2083 printf("m popcntl %016llx %016llx %016llx %016llx\n", 2084 block[0], block[1], block[2], block[3] & oszacp_mask); 2085 } 2086 } 2087 2088 2089 void test_POPCNTW ( void ) 2090 { 2091 ULong block[4]; 2092 Int i; 2093 ULong oszacp_mask = 0x8D5; 2094 for (i = 0; i < 10; i++) { 2095 block[0] = i == 0 ? 0 : randULong(); 2096 block[1] = randULong(); 2097 block[2] = randULong(); 2098 block[3] = randULong(); 2099 __asm__ __volatile__( 2100 "movq %0, %%rax" "\n\t" 2101 "movq 0(%%rax), %%rdi" "\n\t" 2102 "movq 8(%%rax), %%r11" "\n\t" 2103 #ifndef VGP_amd64_darwin 2104 "popcntw %%di, %%r11w" "\n\t" 2105 #else 2106 "popcnt %%di, %%r11w" "\n\t" 2107 #endif 2108 "movq %%r11, 16(%%rax)" "\n\t" 2109 "pushfq" "\n\t" 2110 "popq %%r12" "\n\t" 2111 "movq %%r12, 24(%%rax)" "\n" 2112 : /*out*/ 2113 : /*in*/"r"(&block[0]) 2114 : /*trash*/ "cc", "memory", "rdi", "r11", "r12" 2115 ); 2116 printf("r popcntw %016llx %016llx %016llx %016llx\n", 2117 block[0], block[1], block[2], block[3] & oszacp_mask); 2118 2119 block[0] = i == 0 ? 
0 : randULong(); 2120 block[1] = randULong(); 2121 block[2] = randULong(); 2122 block[3] = randULong(); 2123 __asm__ __volatile__( 2124 "movq %0, %%rax" "\n\t" 2125 "movq 8(%%rax), %%r11" "\n\t" 2126 #ifndef VGP_amd64_darwin 2127 "popcntw 0(%%rax), %%r11w" "\n\t" 2128 #else 2129 "popcnt 0(%%rax), %%r11w" "\n\t" 2130 #endif 2131 "movq %%r11, 16(%%rax)" "\n\t" 2132 "pushfq" "\n\t" 2133 "popq %%r12" "\n\t" 2134 "movq %%r12, 24(%%rax)" "\n" 2135 : /*out*/ 2136 : /*in*/"r"(&block[0]) 2137 : /*trash*/ "cc", "memory", "r11", "r12" 2138 ); 2139 printf("m popcntw %016llx %016llx %016llx %016llx\n", 2140 block[0], block[1], block[2], block[3] & oszacp_mask); 2141 } 2142 } 2143 2144 2145 void test_PCMPGTQ ( void ) 2146 { 2147 V128 spec[7]; 2148 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL ); 2149 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL ); 2150 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL ); 2151 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL ); 2152 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL ); 2153 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL ); 2154 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL ); 2155 2156 V128 src, dst; 2157 Int i, j; 2158 for (i = 0; i < 10; i++) { 2159 randV128(&src); 2160 randV128(&dst); 2161 DO_mandr_r("pcmpgtq", src, dst); 2162 } 2163 for (i = 0; i < 7; i++) { 2164 for (j = 0; j < 7; j++) { 2165 memcpy(&src, &spec[i], 16); 2166 memcpy(&dst, &spec[j], 16); 2167 DO_mandr_r("pcmpgtq", src, dst); 2168 } 2169 } 2170 } 2171 2172 /* ------------ ROUNDSD ------------ */ 2173 2174 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2175 { 2176 if (mem) { 2177 __asm__ __volatile__( 2178 "movupd (%1), %%xmm11" "\n\t" 2179 "roundsd $0, (%0), %%xmm11" "\n\t" 2180 "movupd %%xmm11, (%1)" "\n" 2181 : /*OUT*/ 2182 : /*IN*/ "r"(src), "r"(dst) 2183 : /*TRASH*/ "xmm11" 2184 ); 2185 } else { 
2186 __asm__ __volatile__( 2187 "movupd (%1), %%xmm11" "\n\t" 2188 "movupd (%0), %%xmm2" "\n\t" 2189 "roundsd $0, %%xmm2, %%xmm11" "\n\t" 2190 "movupd %%xmm11, (%1)" "\n" 2191 : /*OUT*/ 2192 : /*IN*/ "r"(src), "r"(dst) 2193 : /*TRASH*/ "xmm11","xmm2" 2194 ); 2195 } 2196 } 2197 2198 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2199 { 2200 if (mem) { 2201 __asm__ __volatile__( 2202 "movupd (%1), %%xmm11" "\n\t" 2203 "roundsd $1, (%0), %%xmm11" "\n\t" 2204 "movupd %%xmm11, (%1)" "\n" 2205 : /*OUT*/ 2206 : /*IN*/ "r"(src), "r"(dst) 2207 : /*TRASH*/ "xmm11" 2208 ); 2209 } else { 2210 __asm__ __volatile__( 2211 "movupd (%1), %%xmm11" "\n\t" 2212 "movupd (%0), %%xmm2" "\n\t" 2213 "roundsd $1, %%xmm2, %%xmm11" "\n\t" 2214 "movupd %%xmm11, (%1)" "\n" 2215 : /*OUT*/ 2216 : /*IN*/ "r"(src), "r"(dst) 2217 : /*TRASH*/ "xmm11","xmm2" 2218 ); 2219 } 2220 } 2221 2222 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2223 { 2224 if (mem) { 2225 __asm__ __volatile__( 2226 "movupd (%1), %%xmm11" "\n\t" 2227 "roundsd $2, (%0), %%xmm11" "\n\t" 2228 "movupd %%xmm11, (%1)" "\n" 2229 : /*OUT*/ 2230 : /*IN*/ "r"(src), "r"(dst) 2231 : /*TRASH*/ "xmm11" 2232 ); 2233 } else { 2234 __asm__ __volatile__( 2235 "movupd (%1), %%xmm11" "\n\t" 2236 "movupd (%0), %%xmm2" "\n\t" 2237 "roundsd $2, %%xmm2, %%xmm11" "\n\t" 2238 "movupd %%xmm11, (%1)" "\n" 2239 : /*OUT*/ 2240 : /*IN*/ "r"(src), "r"(dst) 2241 : /*TRASH*/ "xmm11","xmm2" 2242 ); 2243 } 2244 } 2245 2246 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2247 { 2248 if (mem) { 2249 __asm__ __volatile__( 2250 "movupd (%1), %%xmm11" "\n\t" 2251 "roundsd $3, (%0), %%xmm11" "\n\t" 2252 "movupd %%xmm11, (%1)" "\n" 2253 : /*OUT*/ 2254 : /*IN*/ "r"(src), "r"(dst) 2255 : /*TRASH*/ "xmm11" 2256 ); 2257 } else { 2258 __asm__ __volatile__( 2259 "movupd (%1), %%xmm11" "\n\t" 2260 "movupd (%0), %%xmm2" "\n\t" 2261 "roundsd $3, %%xmm2, %%xmm11" "\n\t" 2262 "movupd %%xmm11, (%1)" "\n" 2263 : /*OUT*/ 2264 : /*IN*/ 
"r"(src), "r"(dst) 2265 : /*TRASH*/ "xmm11","xmm2" 2266 ); 2267 } 2268 } 2269 2270 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2271 { 2272 if (mem) { 2273 __asm__ __volatile__( 2274 "movupd (%1), %%xmm11" "\n\t" 2275 "roundsd $4, (%0), %%xmm11" "\n\t" 2276 "movupd %%xmm11, (%1)" "\n" 2277 : /*OUT*/ 2278 : /*IN*/ "r"(src), "r"(dst) 2279 : /*TRASH*/ "xmm11" 2280 ); 2281 } else { 2282 __asm__ __volatile__( 2283 "movupd (%1), %%xmm11" "\n\t" 2284 "movupd (%0), %%xmm2" "\n\t" 2285 "roundsd $4, %%xmm2, %%xmm11" "\n\t" 2286 "movupd %%xmm11, (%1)" "\n" 2287 : /*OUT*/ 2288 : /*IN*/ "r"(src), "r"(dst) 2289 : /*TRASH*/ "xmm11","xmm2" 2290 ); 2291 } 2292 } 2293 2294 void test_ROUNDSD_w_immediate_rounding ( void ) 2295 { 2296 double vals[22]; 2297 Int i = 0; 2298 vals[i++] = 0.0; 2299 vals[i++] = -0.0; 2300 vals[i++] = mkPosInf(); 2301 vals[i++] = mkNegInf(); 2302 vals[i++] = mkPosNan(); 2303 vals[i++] = mkNegNan(); 2304 vals[i++] = -1.3; 2305 vals[i++] = -1.1; 2306 vals[i++] = -0.9; 2307 vals[i++] = -0.7; 2308 vals[i++] = -0.50001; 2309 vals[i++] = -0.49999; 2310 vals[i++] = -0.3; 2311 vals[i++] = -0.1; 2312 vals[i++] = 0.1; 2313 vals[i++] = 0.3; 2314 vals[i++] = 0.49999; 2315 vals[i++] = 0.50001; 2316 vals[i++] = 0.7; 2317 vals[i++] = 0.9; 2318 vals[i++] = 1.1; 2319 vals[i++] = 1.3; 2320 assert(i == 22); 2321 2322 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2323 V128 src, dst; 2324 2325 randV128(&src); 2326 randV128(&dst); 2327 memcpy(&src[0], &vals[i], 8); 2328 do_ROUNDSD_000(False/*reg*/, &src, &dst); 2329 printf("r roundsd_000 "); 2330 showV128(&src); 2331 printf(" "); 2332 showV128(&dst); 2333 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2334 printf("\n"); 2335 2336 randV128(&src); 2337 randV128(&dst); 2338 memcpy(&src[0], &vals[i], 8); 2339 do_ROUNDSD_000(True/*mem*/, &src, &dst); 2340 printf("m roundsd_000 "); 2341 showV128(&src); 2342 printf(" "); 2343 showV128(&dst); 2344 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2345 
printf("\n"); 2346 2347 2348 randV128(&src); 2349 randV128(&dst); 2350 memcpy(&src[0], &vals[i], 8); 2351 do_ROUNDSD_001(False/*reg*/, &src, &dst); 2352 printf("r roundsd_001 "); 2353 showV128(&src); 2354 printf(" "); 2355 showV128(&dst); 2356 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2357 printf("\n"); 2358 2359 randV128(&src); 2360 randV128(&dst); 2361 memcpy(&src[0], &vals[i], 8); 2362 do_ROUNDSD_001(True/*mem*/, &src, &dst); 2363 printf("m roundsd_001 "); 2364 showV128(&src); 2365 printf(" "); 2366 showV128(&dst); 2367 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2368 printf("\n"); 2369 2370 2371 randV128(&src); 2372 randV128(&dst); 2373 memcpy(&src[0], &vals[i], 8); 2374 do_ROUNDSD_010(False/*reg*/, &src, &dst); 2375 printf("r roundsd_010 "); 2376 showV128(&src); 2377 printf(" "); 2378 showV128(&dst); 2379 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2380 printf("\n"); 2381 2382 randV128(&src); 2383 randV128(&dst); 2384 memcpy(&src[0], &vals[i], 8); 2385 do_ROUNDSD_010(True/*mem*/, &src, &dst); 2386 printf("m roundsd_010 "); 2387 showV128(&src); 2388 printf(" "); 2389 showV128(&dst); 2390 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2391 printf("\n"); 2392 2393 2394 randV128(&src); 2395 randV128(&dst); 2396 memcpy(&src[0], &vals[i], 8); 2397 do_ROUNDSD_011(False/*reg*/, &src, &dst); 2398 printf("r roundsd_011 "); 2399 showV128(&src); 2400 printf(" "); 2401 showV128(&dst); 2402 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2403 printf("\n"); 2404 2405 randV128(&src); 2406 randV128(&dst); 2407 memcpy(&src[0], &vals[i], 8); 2408 do_ROUNDSD_011(True/*mem*/, &src, &dst); 2409 printf("m roundsd_011 "); 2410 showV128(&src); 2411 printf(" "); 2412 showV128(&dst); 2413 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2414 printf("\n"); 2415 } 2416 } 2417 2418 void test_ROUNDSD_w_mxcsr_rounding ( void ) 2419 { 2420 UInt rm; 2421 double vals[22]; 2422 Int i = 0; 2423 vals[i++] = 0.0; 2424 vals[i++] = -0.0; 2425 vals[i++] = 
mkPosInf(); 2426 vals[i++] = mkNegInf(); 2427 vals[i++] = mkPosNan(); 2428 vals[i++] = mkNegNan(); 2429 vals[i++] = -1.3; 2430 vals[i++] = -1.1; 2431 vals[i++] = -0.9; 2432 vals[i++] = -0.7; 2433 vals[i++] = -0.50001; 2434 vals[i++] = -0.49999; 2435 vals[i++] = -0.3; 2436 vals[i++] = -0.1; 2437 vals[i++] = 0.1; 2438 vals[i++] = 0.3; 2439 vals[i++] = 0.49999; 2440 vals[i++] = 0.50001; 2441 vals[i++] = 0.7; 2442 vals[i++] = 0.9; 2443 vals[i++] = 1.1; 2444 vals[i++] = 1.3; 2445 assert(i == 22); 2446 2447 rm = get_sse_roundingmode(); 2448 assert(rm == 0); // 0 == RN == default 2449 2450 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2451 V128 src, dst; 2452 2453 for (rm = 0; rm <= 3; rm++) { 2454 set_sse_roundingmode(rm); 2455 2456 randV128(&src); 2457 randV128(&dst); 2458 memcpy(&src[0], &vals[i], 8); 2459 do_ROUNDSD_1XX(False/*reg*/, &src, &dst); 2460 printf("r (rm=%u) roundsd_1XX ", rm); 2461 showV128(&src); 2462 printf(" "); 2463 showV128(&dst); 2464 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2465 printf("\n"); 2466 2467 randV128(&src); 2468 randV128(&dst); 2469 memcpy(&src[0], &vals[i], 8); 2470 do_ROUNDSD_1XX(True/*mem*/, &src, &dst); 2471 printf("m (rm=%u) roundsd_1XX ", rm); 2472 showV128(&src); 2473 printf(" "); 2474 showV128(&dst); 2475 printf(" %10f %10f", vals[i], *(double*)(&dst[0])); 2476 printf("\n"); 2477 } 2478 } 2479 2480 rm = get_sse_roundingmode(); 2481 assert(rm == 3); 2482 set_sse_roundingmode(0); 2483 rm = get_sse_roundingmode(); 2484 assert(rm == 0); // 0 == RN == default 2485 } 2486 2487 2488 /* ------------ ROUNDSS ------------ */ 2489 2490 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2491 { 2492 if (mem) { 2493 __asm__ __volatile__( 2494 "movupd (%1), %%xmm11" "\n\t" 2495 "roundss $0, (%0), %%xmm11" "\n\t" 2496 "movupd %%xmm11, (%1)" "\n" 2497 : /*OUT*/ 2498 : /*IN*/ "r"(src), "r"(dst) 2499 : /*TRASH*/ "xmm11" 2500 ); 2501 } else { 2502 __asm__ __volatile__( 2503 "movupd (%1), %%xmm11" "\n\t" 2504 "movupd (%0), 
%%xmm2" "\n\t" 2505 "roundss $0, %%xmm2, %%xmm11" "\n\t" 2506 "movupd %%xmm11, (%1)" "\n" 2507 : /*OUT*/ 2508 : /*IN*/ "r"(src), "r"(dst) 2509 : /*TRASH*/ "xmm11","xmm2" 2510 ); 2511 } 2512 } 2513 2514 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2515 { 2516 if (mem) { 2517 __asm__ __volatile__( 2518 "movupd (%1), %%xmm11" "\n\t" 2519 "roundss $1, (%0), %%xmm11" "\n\t" 2520 "movupd %%xmm11, (%1)" "\n" 2521 : /*OUT*/ 2522 : /*IN*/ "r"(src), "r"(dst) 2523 : /*TRASH*/ "xmm11" 2524 ); 2525 } else { 2526 __asm__ __volatile__( 2527 "movupd (%1), %%xmm11" "\n\t" 2528 "movupd (%0), %%xmm2" "\n\t" 2529 "roundss $1, %%xmm2, %%xmm11" "\n\t" 2530 "movupd %%xmm11, (%1)" "\n" 2531 : /*OUT*/ 2532 : /*IN*/ "r"(src), "r"(dst) 2533 : /*TRASH*/ "xmm11","xmm2" 2534 ); 2535 } 2536 } 2537 2538 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2539 { 2540 if (mem) { 2541 __asm__ __volatile__( 2542 "movupd (%1), %%xmm11" "\n\t" 2543 "roundss $2, (%0), %%xmm11" "\n\t" 2544 "movupd %%xmm11, (%1)" "\n" 2545 : /*OUT*/ 2546 : /*IN*/ "r"(src), "r"(dst) 2547 : /*TRASH*/ "xmm11" 2548 ); 2549 } else { 2550 __asm__ __volatile__( 2551 "movupd (%1), %%xmm11" "\n\t" 2552 "movupd (%0), %%xmm2" "\n\t" 2553 "roundss $2, %%xmm2, %%xmm11" "\n\t" 2554 "movupd %%xmm11, (%1)" "\n" 2555 : /*OUT*/ 2556 : /*IN*/ "r"(src), "r"(dst) 2557 : /*TRASH*/ "xmm11","xmm2" 2558 ); 2559 } 2560 } 2561 2562 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2563 { 2564 if (mem) { 2565 __asm__ __volatile__( 2566 "movupd (%1), %%xmm11" "\n\t" 2567 "roundss $3, (%0), %%xmm11" "\n\t" 2568 "movupd %%xmm11, (%1)" "\n" 2569 : /*OUT*/ 2570 : /*IN*/ "r"(src), "r"(dst) 2571 : /*TRASH*/ "xmm11" 2572 ); 2573 } else { 2574 __asm__ __volatile__( 2575 "movupd (%1), %%xmm11" "\n\t" 2576 "movupd (%0), %%xmm2" "\n\t" 2577 "roundss $3, %%xmm2, %%xmm11" "\n\t" 2578 "movupd %%xmm11, (%1)" "\n" 2579 : /*OUT*/ 2580 : /*IN*/ "r"(src), "r"(dst) 2581 : /*TRASH*/ "xmm11","xmm2" 2582 ); 2583 } 2584 } 2585 2586 
void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2587 { 2588 if (mem) { 2589 __asm__ __volatile__( 2590 "movupd (%1), %%xmm11" "\n\t" 2591 "roundss $4, (%0), %%xmm11" "\n\t" 2592 "movupd %%xmm11, (%1)" "\n" 2593 : /*OUT*/ 2594 : /*IN*/ "r"(src), "r"(dst) 2595 : /*TRASH*/ "xmm11" 2596 ); 2597 } else { 2598 __asm__ __volatile__( 2599 "movupd (%1), %%xmm11" "\n\t" 2600 "movupd (%0), %%xmm2" "\n\t" 2601 "roundss $4, %%xmm2, %%xmm11" "\n\t" 2602 "movupd %%xmm11, (%1)" "\n" 2603 : /*OUT*/ 2604 : /*IN*/ "r"(src), "r"(dst) 2605 : /*TRASH*/ "xmm11","xmm2" 2606 ); 2607 } 2608 } 2609 2610 void test_ROUNDSS_w_immediate_rounding ( void ) 2611 { 2612 float vals[22]; 2613 Int i = 0; 2614 vals[i++] = 0.0; 2615 vals[i++] = -0.0; 2616 vals[i++] = mkPosInf(); 2617 vals[i++] = mkNegInf(); 2618 vals[i++] = mkPosNan(); 2619 vals[i++] = mkNegNan(); 2620 vals[i++] = -1.3; 2621 vals[i++] = -1.1; 2622 vals[i++] = -0.9; 2623 vals[i++] = -0.7; 2624 vals[i++] = -0.50001; 2625 vals[i++] = -0.49999; 2626 vals[i++] = -0.3; 2627 vals[i++] = -0.1; 2628 vals[i++] = 0.1; 2629 vals[i++] = 0.3; 2630 vals[i++] = 0.49999; 2631 vals[i++] = 0.50001; 2632 vals[i++] = 0.7; 2633 vals[i++] = 0.9; 2634 vals[i++] = 1.1; 2635 vals[i++] = 1.3; 2636 assert(i == 22); 2637 2638 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2639 V128 src, dst; 2640 2641 randV128(&src); 2642 randV128(&dst); 2643 memcpy(&src[0], &vals[i], 4); 2644 do_ROUNDSS_000(False/*reg*/, &src, &dst); 2645 printf("r roundss_000 "); 2646 showV128(&src); 2647 printf(" "); 2648 showV128(&dst); 2649 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2650 printf("\n"); 2651 2652 randV128(&src); 2653 randV128(&dst); 2654 memcpy(&src[0], &vals[i], 4); 2655 do_ROUNDSS_000(True/*mem*/, &src, &dst); 2656 printf("m roundss_000 "); 2657 showV128(&src); 2658 printf(" "); 2659 showV128(&dst); 2660 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2661 printf("\n"); 2662 2663 2664 randV128(&src); 2665 
randV128(&dst); 2666 memcpy(&src[0], &vals[i], 4); 2667 do_ROUNDSS_001(False/*reg*/, &src, &dst); 2668 printf("r roundss_001 "); 2669 showV128(&src); 2670 printf(" "); 2671 showV128(&dst); 2672 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2673 printf("\n"); 2674 2675 randV128(&src); 2676 randV128(&dst); 2677 memcpy(&src[0], &vals[i], 4); 2678 do_ROUNDSS_001(True/*mem*/, &src, &dst); 2679 printf("m roundss_001 "); 2680 showV128(&src); 2681 printf(" "); 2682 showV128(&dst); 2683 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2684 printf("\n"); 2685 2686 2687 randV128(&src); 2688 randV128(&dst); 2689 memcpy(&src[0], &vals[i], 4); 2690 do_ROUNDSS_010(False/*reg*/, &src, &dst); 2691 printf("r roundss_010 "); 2692 showV128(&src); 2693 printf(" "); 2694 showV128(&dst); 2695 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2696 printf("\n"); 2697 2698 randV128(&src); 2699 randV128(&dst); 2700 memcpy(&src[0], &vals[i], 4); 2701 do_ROUNDSS_010(True/*mem*/, &src, &dst); 2702 printf("m roundss_010 "); 2703 showV128(&src); 2704 printf(" "); 2705 showV128(&dst); 2706 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2707 printf("\n"); 2708 2709 2710 randV128(&src); 2711 randV128(&dst); 2712 memcpy(&src[0], &vals[i], 4); 2713 do_ROUNDSS_011(False/*reg*/, &src, &dst); 2714 printf("r roundss_011 "); 2715 showV128(&src); 2716 printf(" "); 2717 showV128(&dst); 2718 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2719 printf("\n"); 2720 2721 randV128(&src); 2722 randV128(&dst); 2723 memcpy(&src[0], &vals[i], 4); 2724 do_ROUNDSS_011(True/*mem*/, &src, &dst); 2725 printf("m roundss_011 "); 2726 showV128(&src); 2727 printf(" "); 2728 showV128(&dst); 2729 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2730 printf("\n"); 2731 } 2732 } 2733 2734 void test_ROUNDSS_w_mxcsr_rounding ( void ) 2735 { 2736 UInt rm; 2737 float vals[22]; 2738 Int i = 0; 2739 vals[i++] = 0.0; 2740 
vals[i++] = -0.0; 2741 vals[i++] = mkPosInf(); 2742 vals[i++] = mkNegInf(); 2743 vals[i++] = mkPosNan(); 2744 vals[i++] = mkNegNan(); 2745 vals[i++] = -1.3; 2746 vals[i++] = -1.1; 2747 vals[i++] = -0.9; 2748 vals[i++] = -0.7; 2749 vals[i++] = -0.50001; 2750 vals[i++] = -0.49999; 2751 vals[i++] = -0.3; 2752 vals[i++] = -0.1; 2753 vals[i++] = 0.1; 2754 vals[i++] = 0.3; 2755 vals[i++] = 0.49999; 2756 vals[i++] = 0.50001; 2757 vals[i++] = 0.7; 2758 vals[i++] = 0.9; 2759 vals[i++] = 1.1; 2760 vals[i++] = 1.3; 2761 assert(i == 22); 2762 2763 rm = get_sse_roundingmode(); 2764 assert(rm == 0); // 0 == RN == default 2765 2766 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2767 V128 src, dst; 2768 2769 for (rm = 0; rm <= 3; rm++) { 2770 set_sse_roundingmode(rm); 2771 2772 randV128(&src); 2773 randV128(&dst); 2774 memcpy(&src[0], &vals[i], 4); 2775 do_ROUNDSS_1XX(False/*reg*/, &src, &dst); 2776 printf("r (rm=%u) roundss_1XX ", rm); 2777 showV128(&src); 2778 printf(" "); 2779 showV128(&dst); 2780 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2781 printf("\n"); 2782 2783 randV128(&src); 2784 randV128(&dst); 2785 memcpy(&src[0], &vals[i], 4); 2786 do_ROUNDSS_1XX(True/*mem*/, &src, &dst); 2787 printf("m (rm=%u) roundss_1XX ", rm); 2788 showV128(&src); 2789 printf(" "); 2790 showV128(&dst); 2791 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0])); 2792 printf("\n"); 2793 } 2794 } 2795 2796 rm = get_sse_roundingmode(); 2797 assert(rm == 3); 2798 set_sse_roundingmode(0); 2799 rm = get_sse_roundingmode(); 2800 assert(rm == 0); // 0 == RN == default 2801 } 2802 2803 /* ------------ ROUNDPD ------------ */ 2804 2805 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2806 { 2807 if (mem) { 2808 __asm__ __volatile__( 2809 "movupd (%1), %%xmm11" "\n\t" 2810 "roundpd $0, (%0), %%xmm11" "\n\t" 2811 "movupd %%xmm11, (%1)" "\n" 2812 : /*OUT*/ 2813 : /*IN*/ "r"(src), "r"(dst) 2814 : /*TRASH*/ "xmm11" 2815 ); 2816 } else { 2817 __asm__ 
__volatile__( 2818 "movupd (%1), %%xmm11" "\n\t" 2819 "movupd (%0), %%xmm2" "\n\t" 2820 "roundpd $0, %%xmm2, %%xmm11" "\n\t" 2821 "movupd %%xmm11, (%1)" "\n" 2822 : /*OUT*/ 2823 : /*IN*/ "r"(src), "r"(dst) 2824 : /*TRASH*/ "xmm11","xmm2" 2825 ); 2826 } 2827 } 2828 2829 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2830 { 2831 if (mem) { 2832 __asm__ __volatile__( 2833 "movupd (%1), %%xmm11" "\n\t" 2834 "roundpd $1, (%0), %%xmm11" "\n\t" 2835 "movupd %%xmm11, (%1)" "\n" 2836 : /*OUT*/ 2837 : /*IN*/ "r"(src), "r"(dst) 2838 : /*TRASH*/ "xmm11" 2839 ); 2840 } else { 2841 __asm__ __volatile__( 2842 "movupd (%1), %%xmm11" "\n\t" 2843 "movupd (%0), %%xmm2" "\n\t" 2844 "roundpd $1, %%xmm2, %%xmm11" "\n\t" 2845 "movupd %%xmm11, (%1)" "\n" 2846 : /*OUT*/ 2847 : /*IN*/ "r"(src), "r"(dst) 2848 : /*TRASH*/ "xmm11","xmm2" 2849 ); 2850 } 2851 } 2852 2853 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2854 { 2855 if (mem) { 2856 __asm__ __volatile__( 2857 "movupd (%1), %%xmm11" "\n\t" 2858 "roundpd $2, (%0), %%xmm11" "\n\t" 2859 "movupd %%xmm11, (%1)" "\n" 2860 : /*OUT*/ 2861 : /*IN*/ "r"(src), "r"(dst) 2862 : /*TRASH*/ "xmm11" 2863 ); 2864 } else { 2865 __asm__ __volatile__( 2866 "movupd (%1), %%xmm11" "\n\t" 2867 "movupd (%0), %%xmm2" "\n\t" 2868 "roundpd $2, %%xmm2, %%xmm11" "\n\t" 2869 "movupd %%xmm11, (%1)" "\n" 2870 : /*OUT*/ 2871 : /*IN*/ "r"(src), "r"(dst) 2872 : /*TRASH*/ "xmm11","xmm2" 2873 ); 2874 } 2875 } 2876 2877 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 2878 { 2879 if (mem) { 2880 __asm__ __volatile__( 2881 "movupd (%1), %%xmm11" "\n\t" 2882 "roundpd $3, (%0), %%xmm11" "\n\t" 2883 "movupd %%xmm11, (%1)" "\n" 2884 : /*OUT*/ 2885 : /*IN*/ "r"(src), "r"(dst) 2886 : /*TRASH*/ "xmm11" 2887 ); 2888 } else { 2889 __asm__ __volatile__( 2890 "movupd (%1), %%xmm11" "\n\t" 2891 "movupd (%0), %%xmm2" "\n\t" 2892 "roundpd $3, %%xmm2, %%xmm11" "\n\t" 2893 "movupd %%xmm11, (%1)" "\n" 2894 : /*OUT*/ 2895 : /*IN*/ "r"(src), "r"(dst) 
2896 : /*TRASH*/ "xmm11","xmm2" 2897 ); 2898 } 2899 } 2900 2901 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 2902 { 2903 if (mem) { 2904 __asm__ __volatile__( 2905 "movupd (%1), %%xmm11" "\n\t" 2906 "roundpd $4, (%0), %%xmm11" "\n\t" 2907 "movupd %%xmm11, (%1)" "\n" 2908 : /*OUT*/ 2909 : /*IN*/ "r"(src), "r"(dst) 2910 : /*TRASH*/ "xmm11" 2911 ); 2912 } else { 2913 __asm__ __volatile__( 2914 "movupd (%1), %%xmm11" "\n\t" 2915 "movupd (%0), %%xmm2" "\n\t" 2916 "roundpd $4, %%xmm2, %%xmm11" "\n\t" 2917 "movupd %%xmm11, (%1)" "\n" 2918 : /*OUT*/ 2919 : /*IN*/ "r"(src), "r"(dst) 2920 : /*TRASH*/ "xmm11","xmm2" 2921 ); 2922 } 2923 } 2924 2925 void test_ROUNDPD_w_immediate_rounding ( void ) 2926 { 2927 double vals[22]; 2928 Int i = 0; 2929 vals[i++] = 0.0; 2930 vals[i++] = -0.0; 2931 vals[i++] = mkPosInf(); 2932 vals[i++] = mkNegInf(); 2933 vals[i++] = mkPosNan(); 2934 vals[i++] = mkNegNan(); 2935 vals[i++] = -1.3; 2936 vals[i++] = -1.1; 2937 vals[i++] = -0.9; 2938 vals[i++] = -0.7; 2939 vals[i++] = -0.50001; 2940 vals[i++] = -0.49999; 2941 vals[i++] = -0.3; 2942 vals[i++] = -0.1; 2943 vals[i++] = 0.1; 2944 vals[i++] = 0.3; 2945 vals[i++] = 0.49999; 2946 vals[i++] = 0.50001; 2947 vals[i++] = 0.7; 2948 vals[i++] = 0.9; 2949 vals[i++] = 1.1; 2950 vals[i++] = 1.3; 2951 assert(i == 22); 2952 2953 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 2954 V128 src, dst; 2955 2956 randV128(&src); 2957 randV128(&dst); 2958 memcpy(&src[0], &vals[i], 8); 2959 memcpy(&src[8], &vals[(i+11)%22], 8); 2960 do_ROUNDPD_000(False/*reg*/, &src, &dst); 2961 printf("r roundpd_000 "); 2962 showV128(&src); 2963 printf(" "); 2964 showV128(&dst); 2965 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 2966 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 2967 printf("\n"); 2968 2969 randV128(&src); 2970 randV128(&dst); 2971 memcpy(&src[0], &vals[i], 8); 2972 memcpy(&src[8], &vals[(i+11)%22], 8); 2973 do_ROUNDPD_000(True/*mem*/, &src, &dst); 2974 printf("m 
roundpd_000 "); 2975 showV128(&src); 2976 printf(" "); 2977 showV128(&dst); 2978 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 2979 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 2980 printf("\n"); 2981 2982 2983 randV128(&src); 2984 randV128(&dst); 2985 memcpy(&src[0], &vals[i], 8); 2986 memcpy(&src[8], &vals[(i+11)%22], 8); 2987 do_ROUNDPD_001(False/*reg*/, &src, &dst); 2988 printf("r roundpd_001 "); 2989 showV128(&src); 2990 printf(" "); 2991 showV128(&dst); 2992 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 2993 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 2994 printf("\n"); 2995 2996 randV128(&src); 2997 randV128(&dst); 2998 memcpy(&src[0], &vals[i], 8); 2999 memcpy(&src[8], &vals[(i+11)%22], 8); 3000 do_ROUNDPD_001(True/*mem*/, &src, &dst); 3001 printf("m roundpd_001 "); 3002 showV128(&src); 3003 printf(" "); 3004 showV128(&dst); 3005 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3006 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3007 printf("\n"); 3008 3009 3010 randV128(&src); 3011 randV128(&dst); 3012 memcpy(&src[0], &vals[i], 8); 3013 memcpy(&src[8], &vals[(i+11)%22], 8); 3014 do_ROUNDPD_010(False/*reg*/, &src, &dst); 3015 printf("r roundpd_010 "); 3016 showV128(&src); 3017 printf(" "); 3018 showV128(&dst); 3019 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3020 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3021 printf("\n"); 3022 3023 randV128(&src); 3024 randV128(&dst); 3025 memcpy(&src[0], &vals[i], 8); 3026 memcpy(&src[8], &vals[(i+11)%22], 8); 3027 do_ROUNDPD_010(True/*mem*/, &src, &dst); 3028 printf("m roundpd_010 "); 3029 showV128(&src); 3030 printf(" "); 3031 showV128(&dst); 3032 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3033 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3034 printf("\n"); 3035 3036 3037 randV128(&src); 3038 randV128(&dst); 3039 memcpy(&src[0], &vals[i], 8); 3040 memcpy(&src[8], &vals[(i+11)%22], 
8); 3041 do_ROUNDPD_011(False/*reg*/, &src, &dst); 3042 printf("r roundpd_011 "); 3043 showV128(&src); 3044 printf(" "); 3045 showV128(&dst); 3046 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3047 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3048 printf("\n"); 3049 3050 randV128(&src); 3051 randV128(&dst); 3052 memcpy(&src[0], &vals[i], 8); 3053 memcpy(&src[8], &vals[(i+11)%22], 8); 3054 do_ROUNDPD_011(True/*mem*/, &src, &dst); 3055 printf("m roundpd_011 "); 3056 showV128(&src); 3057 printf(" "); 3058 showV128(&dst); 3059 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3060 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3061 printf("\n"); 3062 } 3063 } 3064 3065 void test_ROUNDPD_w_mxcsr_rounding ( void ) 3066 { 3067 UInt rm; 3068 double vals[22]; 3069 Int i = 0; 3070 vals[i++] = 0.0; 3071 vals[i++] = -0.0; 3072 vals[i++] = mkPosInf(); 3073 vals[i++] = mkNegInf(); 3074 vals[i++] = mkPosNan(); 3075 vals[i++] = mkNegNan(); 3076 vals[i++] = -1.3; 3077 vals[i++] = -1.1; 3078 vals[i++] = -0.9; 3079 vals[i++] = -0.7; 3080 vals[i++] = -0.50001; 3081 vals[i++] = -0.49999; 3082 vals[i++] = -0.3; 3083 vals[i++] = -0.1; 3084 vals[i++] = 0.1; 3085 vals[i++] = 0.3; 3086 vals[i++] = 0.49999; 3087 vals[i++] = 0.50001; 3088 vals[i++] = 0.7; 3089 vals[i++] = 0.9; 3090 vals[i++] = 1.1; 3091 vals[i++] = 1.3; 3092 assert(i == 22); 3093 3094 rm = get_sse_roundingmode(); 3095 assert(rm == 0); // 0 == RN == default 3096 3097 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3098 V128 src, dst; 3099 3100 for (rm = 0; rm <= 3; rm++) { 3101 set_sse_roundingmode(rm); 3102 3103 randV128(&src); 3104 randV128(&dst); 3105 memcpy(&src[0], &vals[i], 8); 3106 memcpy(&src[8], &vals[(i+11)%22], 8); 3107 do_ROUNDPD_1XX(False/*reg*/, &src, &dst); 3108 printf("r (rm=%u) roundpd_1XX ", rm); 3109 showV128(&src); 3110 printf(" "); 3111 showV128(&dst); 3112 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3113 printf(" %10f -> %10f", 
vals[(i+11)%22], *(double*)(&dst[8])); 3114 printf("\n"); 3115 3116 randV128(&src); 3117 randV128(&dst); 3118 memcpy(&src[0], &vals[i], 8); 3119 memcpy(&src[8], &vals[(i+11)%22], 8); 3120 do_ROUNDPD_1XX(True/*mem*/, &src, &dst); 3121 printf("m (rm=%u) roundpd_1XX ", rm); 3122 showV128(&src); 3123 printf(" "); 3124 showV128(&dst); 3125 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0])); 3126 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8])); 3127 printf("\n"); 3128 } 3129 } 3130 3131 rm = get_sse_roundingmode(); 3132 assert(rm == 3); 3133 set_sse_roundingmode(0); 3134 rm = get_sse_roundingmode(); 3135 assert(rm == 0); // 0 == RN == default 3136 } 3137 3138 /* ------------ ROUNDPS ------------ */ 3139 3140 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3141 { 3142 if (mem) { 3143 __asm__ __volatile__( 3144 "movupd (%1), %%xmm11" "\n\t" 3145 "roundps $0, (%0), %%xmm11" "\n\t" 3146 "movupd %%xmm11, (%1)" "\n" 3147 : /*OUT*/ 3148 : /*IN*/ "r"(src), "r"(dst) 3149 : /*TRASH*/ "xmm11" 3150 ); 3151 } else { 3152 __asm__ __volatile__( 3153 "movupd (%1), %%xmm11" "\n\t" 3154 "movupd (%0), %%xmm2" "\n\t" 3155 "roundps $0, %%xmm2, %%xmm11" "\n\t" 3156 "movupd %%xmm11, (%1)" "\n" 3157 : /*OUT*/ 3158 : /*IN*/ "r"(src), "r"(dst) 3159 : /*TRASH*/ "xmm11","xmm2" 3160 ); 3161 } 3162 } 3163 3164 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3165 { 3166 if (mem) { 3167 __asm__ __volatile__( 3168 "movupd (%1), %%xmm11" "\n\t" 3169 "roundps $1, (%0), %%xmm11" "\n\t" 3170 "movupd %%xmm11, (%1)" "\n" 3171 : /*OUT*/ 3172 : /*IN*/ "r"(src), "r"(dst) 3173 : /*TRASH*/ "xmm11" 3174 ); 3175 } else { 3176 __asm__ __volatile__( 3177 "movupd (%1), %%xmm11" "\n\t" 3178 "movupd (%0), %%xmm2" "\n\t" 3179 "roundps $1, %%xmm2, %%xmm11" "\n\t" 3180 "movupd %%xmm11, (%1)" "\n" 3181 : /*OUT*/ 3182 : /*IN*/ "r"(src), "r"(dst) 3183 : /*TRASH*/ "xmm11","xmm2" 3184 ); 3185 } 3186 } 3187 3188 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3189 { 3190 
if (mem) { 3191 __asm__ __volatile__( 3192 "movupd (%1), %%xmm11" "\n\t" 3193 "roundps $2, (%0), %%xmm11" "\n\t" 3194 "movupd %%xmm11, (%1)" "\n" 3195 : /*OUT*/ 3196 : /*IN*/ "r"(src), "r"(dst) 3197 : /*TRASH*/ "xmm11" 3198 ); 3199 } else { 3200 __asm__ __volatile__( 3201 "movupd (%1), %%xmm11" "\n\t" 3202 "movupd (%0), %%xmm2" "\n\t" 3203 "roundps $2, %%xmm2, %%xmm11" "\n\t" 3204 "movupd %%xmm11, (%1)" "\n" 3205 : /*OUT*/ 3206 : /*IN*/ "r"(src), "r"(dst) 3207 : /*TRASH*/ "xmm11","xmm2" 3208 ); 3209 } 3210 } 3211 3212 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst ) 3213 { 3214 if (mem) { 3215 __asm__ __volatile__( 3216 "movupd (%1), %%xmm11" "\n\t" 3217 "roundps $3, (%0), %%xmm11" "\n\t" 3218 "movupd %%xmm11, (%1)" "\n" 3219 : /*OUT*/ 3220 : /*IN*/ "r"(src), "r"(dst) 3221 : /*TRASH*/ "xmm11" 3222 ); 3223 } else { 3224 __asm__ __volatile__( 3225 "movupd (%1), %%xmm11" "\n\t" 3226 "movupd (%0), %%xmm2" "\n\t" 3227 "roundps $3, %%xmm2, %%xmm11" "\n\t" 3228 "movupd %%xmm11, (%1)" "\n" 3229 : /*OUT*/ 3230 : /*IN*/ "r"(src), "r"(dst) 3231 : /*TRASH*/ "xmm11","xmm2" 3232 ); 3233 } 3234 } 3235 3236 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst ) 3237 { 3238 if (mem) { 3239 __asm__ __volatile__( 3240 "movupd (%1), %%xmm11" "\n\t" 3241 "roundps $4, (%0), %%xmm11" "\n\t" 3242 "movupd %%xmm11, (%1)" "\n" 3243 : /*OUT*/ 3244 : /*IN*/ "r"(src), "r"(dst) 3245 : /*TRASH*/ "xmm11" 3246 ); 3247 } else { 3248 __asm__ __volatile__( 3249 "movupd (%1), %%xmm11" "\n\t" 3250 "movupd (%0), %%xmm2" "\n\t" 3251 "roundps $4, %%xmm2, %%xmm11" "\n\t" 3252 "movupd %%xmm11, (%1)" "\n" 3253 : /*OUT*/ 3254 : /*IN*/ "r"(src), "r"(dst) 3255 : /*TRASH*/ "xmm11","xmm2" 3256 ); 3257 } 3258 } 3259 3260 void test_ROUNDPS_w_immediate_rounding ( void ) 3261 { 3262 float vals[22]; 3263 Int i = 0; 3264 vals[i++] = 0.0; 3265 vals[i++] = -0.0; 3266 vals[i++] = mkPosInf(); 3267 vals[i++] = mkNegInf(); 3268 vals[i++] = mkPosNan(); 3269 vals[i++] = mkNegNan(); 3270 vals[i++] = -1.3; 
3271 vals[i++] = -1.1; 3272 vals[i++] = -0.9; 3273 vals[i++] = -0.7; 3274 vals[i++] = -0.50001; 3275 vals[i++] = -0.49999; 3276 vals[i++] = -0.3; 3277 vals[i++] = -0.1; 3278 vals[i++] = 0.1; 3279 vals[i++] = 0.3; 3280 vals[i++] = 0.49999; 3281 vals[i++] = 0.50001; 3282 vals[i++] = 0.7; 3283 vals[i++] = 0.9; 3284 vals[i++] = 1.1; 3285 vals[i++] = 1.3; 3286 assert(i == 22); 3287 3288 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3289 V128 src, dst; 3290 3291 randV128(&src); 3292 randV128(&dst); 3293 memcpy(&src[0], &vals[i], 4); 3294 memcpy(&src[4], &vals[(i+5)%22], 4); 3295 memcpy(&src[8], &vals[(i+11)%22], 4); 3296 memcpy(&src[12], &vals[(i+17)%22], 4); 3297 do_ROUNDPS_000(False/*reg*/, &src, &dst); 3298 printf("r roundps_000 "); 3299 showV128(&src); 3300 printf(" "); 3301 showV128(&dst); 3302 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3303 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3304 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3305 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3306 printf("\n"); 3307 3308 randV128(&src); 3309 randV128(&dst); 3310 memcpy(&src[0], &vals[i], 4); 3311 memcpy(&src[4], &vals[(i+5)%22], 4); 3312 memcpy(&src[8], &vals[(i+11)%22], 4); 3313 memcpy(&src[12], &vals[(i+17)%22], 4); 3314 do_ROUNDPS_000(True/*mem*/, &src, &dst); 3315 printf("m roundps_000 "); 3316 showV128(&src); 3317 printf(" "); 3318 showV128(&dst); 3319 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3320 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3321 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3322 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3323 printf("\n"); 3324 3325 3326 randV128(&src); 3327 randV128(&dst); 3328 memcpy(&src[0], &vals[i], 4); 3329 memcpy(&src[4], &vals[(i+5)%22], 4); 3330 memcpy(&src[8], &vals[(i+11)%22], 4); 3331 memcpy(&src[12], &vals[(i+17)%22], 4); 3332 do_ROUNDPS_001(False/*reg*/, &src, &dst); 
3333 printf("r roundps_001 "); 3334 showV128(&src); 3335 printf(" "); 3336 showV128(&dst); 3337 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3338 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3339 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3340 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3341 printf("\n"); 3342 3343 randV128(&src); 3344 randV128(&dst); 3345 memcpy(&src[0], &vals[i], 4); 3346 memcpy(&src[4], &vals[(i+5)%22], 4); 3347 memcpy(&src[8], &vals[(i+11)%22], 4); 3348 memcpy(&src[12], &vals[(i+17)%22], 4); 3349 do_ROUNDPS_001(True/*mem*/, &src, &dst); 3350 printf("m roundps_001 "); 3351 showV128(&src); 3352 printf(" "); 3353 showV128(&dst); 3354 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3355 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3356 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3357 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3358 printf("\n"); 3359 3360 3361 randV128(&src); 3362 randV128(&dst); 3363 memcpy(&src[0], &vals[i], 4); 3364 memcpy(&src[4], &vals[(i+5)%22], 4); 3365 memcpy(&src[8], &vals[(i+11)%22], 4); 3366 memcpy(&src[12], &vals[(i+17)%22], 4); 3367 do_ROUNDPS_010(False/*reg*/, &src, &dst); 3368 printf("r roundps_010 "); 3369 showV128(&src); 3370 printf(" "); 3371 showV128(&dst); 3372 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3373 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3374 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3375 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3376 printf("\n"); 3377 3378 randV128(&src); 3379 randV128(&dst); 3380 memcpy(&src[0], &vals[i], 4); 3381 memcpy(&src[4], &vals[(i+5)%22], 4); 3382 memcpy(&src[8], &vals[(i+11)%22], 4); 3383 memcpy(&src[12], &vals[(i+17)%22], 4); 3384 do_ROUNDPS_010(True/*mem*/, &src, &dst); 3385 printf("m roundps_010 "); 3386 showV128(&src); 3387 printf(" "); 3388 
showV128(&dst); 3389 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3390 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3391 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3392 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3393 printf("\n"); 3394 3395 3396 randV128(&src); 3397 randV128(&dst); 3398 memcpy(&src[0], &vals[i], 4); 3399 memcpy(&src[4], &vals[(i+5)%22], 4); 3400 memcpy(&src[8], &vals[(i+11)%22], 4); 3401 memcpy(&src[12], &vals[(i+17)%22], 4); 3402 do_ROUNDPS_011(False/*reg*/, &src, &dst); 3403 printf("r roundps_011 "); 3404 showV128(&src); 3405 printf(" "); 3406 showV128(&dst); 3407 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3408 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3409 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3410 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3411 printf("\n"); 3412 3413 randV128(&src); 3414 randV128(&dst); 3415 memcpy(&src[0], &vals[i], 4); 3416 memcpy(&src[4], &vals[(i+5)%22], 4); 3417 memcpy(&src[8], &vals[(i+11)%22], 4); 3418 memcpy(&src[12], &vals[(i+17)%22], 4); 3419 do_ROUNDPS_011(True/*mem*/, &src, &dst); 3420 printf("m roundps_011 "); 3421 showV128(&src); 3422 printf(" "); 3423 showV128(&dst); 3424 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3425 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3426 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3427 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3428 printf("\n"); 3429 } 3430 } 3431 3432 void test_ROUNDPS_w_mxcsr_rounding ( void ) 3433 { 3434 UInt rm; 3435 float vals[22]; 3436 Int i = 0; 3437 vals[i++] = 0.0; 3438 vals[i++] = -0.0; 3439 vals[i++] = mkPosInf(); 3440 vals[i++] = mkNegInf(); 3441 vals[i++] = mkPosNan(); 3442 vals[i++] = mkNegNan(); 3443 vals[i++] = -1.3; 3444 vals[i++] = -1.1; 3445 vals[i++] = -0.9; 3446 vals[i++] = -0.7; 3447 vals[i++] = -0.50001; 
3448 vals[i++] = -0.49999; 3449 vals[i++] = -0.3; 3450 vals[i++] = -0.1; 3451 vals[i++] = 0.1; 3452 vals[i++] = 0.3; 3453 vals[i++] = 0.49999; 3454 vals[i++] = 0.50001; 3455 vals[i++] = 0.7; 3456 vals[i++] = 0.9; 3457 vals[i++] = 1.1; 3458 vals[i++] = 1.3; 3459 assert(i == 22); 3460 3461 rm = get_sse_roundingmode(); 3462 assert(rm == 0); // 0 == RN == default 3463 3464 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) { 3465 V128 src, dst; 3466 3467 for (rm = 0; rm <= 3; rm++) { 3468 set_sse_roundingmode(rm); 3469 3470 randV128(&src); 3471 randV128(&dst); 3472 memcpy(&src[0], &vals[i], 4); 3473 memcpy(&src[4], &vals[(i+5)%22], 4); 3474 memcpy(&src[8], &vals[(i+11)%22], 4); 3475 memcpy(&src[12], &vals[(i+17)%22], 4); 3476 do_ROUNDPS_1XX(False/*reg*/, &src, &dst); 3477 printf("r (rm=%u) roundps_1XX ", rm); 3478 showV128(&src); 3479 printf(" "); 3480 showV128(&dst); 3481 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3482 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3483 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3484 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3485 printf("\n"); 3486 3487 randV128(&src); 3488 randV128(&dst); 3489 memcpy(&src[0], &vals[i], 4); 3490 memcpy(&src[4], &vals[(i+5)%22], 4); 3491 memcpy(&src[8], &vals[(i+11)%22], 4); 3492 memcpy(&src[12], &vals[(i+17)%22], 4); 3493 do_ROUNDPS_1XX(True/*mem*/, &src, &dst); 3494 printf("m (rm=%u) roundps_1XX ", rm); 3495 showV128(&src); 3496 printf(" "); 3497 showV128(&dst); 3498 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0])); 3499 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4])); 3500 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8])); 3501 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12])); 3502 printf("\n"); 3503 } 3504 } 3505 3506 rm = get_sse_roundingmode(); 3507 assert(rm == 3); 3508 set_sse_roundingmode(0); 3509 rm = get_sse_roundingmode(); 3510 assert(rm == 0); // 0 == RN == 
default 3511 } 3512 3513 /* ------------ PTEST ------------ */ 3514 3515 void test_PTEST ( void ) 3516 { 3517 const Int ntests = 8; 3518 V128 spec[ntests]; 3519 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL ); 3520 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL ); 3521 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL ); 3522 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL ); 3523 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL ); 3524 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL ); 3525 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL ); 3526 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL ); 3527 V128 block[2]; 3528 Int i, j; 3529 ULong flags; 3530 for (i = 0; i < ntests; i++) { 3531 for (j = 0; j < ntests; j++) { 3532 memcpy(&block[0], &spec[i], 16); 3533 memcpy(&block[1], &spec[j], 16); 3534 __asm__ __volatile__( 3535 "subq $256, %%rsp" "\n\t" 3536 "movupd 0(%1), %%xmm2" "\n\t" 3537 "ptest 16(%1), %%xmm2" "\n\t" 3538 "pushfq" "\n\t" 3539 "popq %0" "\n\t" 3540 "addq $256, %%rsp" "\n\t" 3541 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) : 3542 "xmm2", "memory", "cc" 3543 ); 3544 printf("r ptest "); 3545 showV128(&block[0]); 3546 printf(" "); 3547 showV128(&block[1]); 3548 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5); 3549 } 3550 } 3551 } 3552 3553 /* ------------ PBLENDVB ------------ */ 3554 3555 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3556 { 3557 if (mem) { 3558 __asm__ __volatile__( 3559 "movupd (%2), %%xmm0" "\n\t" 3560 "movupd (%1), %%xmm11" "\n\t" 3561 "pblendvb (%0), %%xmm11" "\n\t" 3562 "movupd %%xmm11, (%1)" "\n" 3563 : /*OUT*/ 3564 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3565 : /*TRASH*/ "xmm11","xmm0" 3566 ); 3567 } else { 3568 __asm__ __volatile__( 3569 "movupd (%2), %%xmm0" "\n\t" 3570 "movupd (%1), %%xmm11" "\n\t" 3571 "movupd (%0), %%xmm2" "\n\t" 
3572 "pblendvb %%xmm2, %%xmm11" "\n\t" 3573 "movupd %%xmm11, (%1)" "\n" 3574 : /*OUT*/ 3575 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3576 : /*TRASH*/ "xmm11","xmm2","xmm0" 3577 ); 3578 } 3579 } 3580 3581 void test_PBLENDVB ( void ) 3582 { 3583 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3584 Int i; 3585 for (i = 0; i < 10; i++) { 3586 randV128(&t_xmm0); 3587 randV128(&t_src); 3588 randV128(&t_dst); 3589 3590 memcpy(&xmm0, &t_xmm0, 16); 3591 memcpy(&src, &t_src, 16); 3592 memcpy(&dst, &t_dst, 16); 3593 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst); 3594 printf("r pblendvb "); 3595 showV128(&t_xmm0); 3596 printf(" "); 3597 showV128(&t_src); 3598 printf(" "); 3599 showV128(&t_dst); 3600 printf(" -> "); 3601 showV128(&dst); 3602 printf("\n"); 3603 3604 memcpy(&xmm0, &t_xmm0, 16); 3605 memcpy(&src, &t_src, 16); 3606 memcpy(&dst, &t_dst, 16); 3607 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst); 3608 printf("m pblendvb "); 3609 showV128(&t_xmm0); 3610 printf(" "); 3611 showV128(&t_src); 3612 printf(" "); 3613 showV128(&t_dst); 3614 printf(" -> "); 3615 showV128(&dst); 3616 printf("\n"); 3617 } 3618 } 3619 3620 /* ------------ BLENDVPD ------------ */ 3621 3622 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3623 { 3624 if (mem) { 3625 __asm__ __volatile__( 3626 "movupd (%2), %%xmm0" "\n\t" 3627 "movupd (%1), %%xmm11" "\n\t" 3628 "blendvpd (%0), %%xmm11" "\n\t" 3629 "movupd %%xmm11, (%1)" "\n" 3630 : /*OUT*/ 3631 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3632 : /*TRASH*/ "xmm11","xmm0" 3633 ); 3634 } else { 3635 __asm__ __volatile__( 3636 "movupd (%2), %%xmm0" "\n\t" 3637 "movupd (%1), %%xmm11" "\n\t" 3638 "movupd (%0), %%xmm2" "\n\t" 3639 "blendvpd %%xmm2, %%xmm11" "\n\t" 3640 "movupd %%xmm11, (%1)" "\n" 3641 : /*OUT*/ 3642 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3643 : /*TRASH*/ "xmm11","xmm2","xmm0" 3644 ); 3645 } 3646 } 3647 3648 void test_BLENDVPD ( void ) 3649 { 3650 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3651 Int i; 3652 for (i = 0; i < 10; i++) { 
3653 randV128(&t_xmm0); 3654 randV128(&t_src); 3655 randV128(&t_dst); 3656 3657 memcpy(&xmm0, &t_xmm0, 16); 3658 memcpy(&src, &t_src, 16); 3659 memcpy(&dst, &t_dst, 16); 3660 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst); 3661 printf("r blendvpd "); 3662 showV128(&t_xmm0); 3663 printf(" "); 3664 showV128(&t_src); 3665 printf(" "); 3666 showV128(&t_dst); 3667 printf(" -> "); 3668 showV128(&dst); 3669 printf("\n"); 3670 3671 memcpy(&xmm0, &t_xmm0, 16); 3672 memcpy(&src, &t_src, 16); 3673 memcpy(&dst, &t_dst, 16); 3674 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst); 3675 printf("m blendvpd "); 3676 showV128(&t_xmm0); 3677 printf(" "); 3678 showV128(&t_src); 3679 printf(" "); 3680 showV128(&t_dst); 3681 printf(" -> "); 3682 showV128(&dst); 3683 printf("\n"); 3684 } 3685 } 3686 3687 /* ------------ BLENDVPS ------------ */ 3688 3689 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst ) 3690 { 3691 if (mem) { 3692 __asm__ __volatile__( 3693 "movupd (%2), %%xmm0" "\n\t" 3694 "movupd (%1), %%xmm11" "\n\t" 3695 "blendvps (%0), %%xmm11" "\n\t" 3696 "movupd %%xmm11, (%1)" "\n" 3697 : /*OUT*/ 3698 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3699 : /*TRASH*/ "xmm11","xmm0" 3700 ); 3701 } else { 3702 __asm__ __volatile__( 3703 "movupd (%2), %%xmm0" "\n\t" 3704 "movupd (%1), %%xmm11" "\n\t" 3705 "movupd (%0), %%xmm2" "\n\t" 3706 "blendvps %%xmm2, %%xmm11" "\n\t" 3707 "movupd %%xmm11, (%1)" "\n" 3708 : /*OUT*/ 3709 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0) 3710 : /*TRASH*/ "xmm11","xmm2","xmm0" 3711 ); 3712 } 3713 } 3714 3715 void test_BLENDVPS ( void ) 3716 { 3717 V128 xmm0, src, dst, t_xmm0, t_src, t_dst; 3718 Int i; 3719 for (i = 0; i < 10; i++) { 3720 randV128(&t_xmm0); 3721 randV128(&t_src); 3722 randV128(&t_dst); 3723 3724 memcpy(&xmm0, &t_xmm0, 16); 3725 memcpy(&src, &t_src, 16); 3726 memcpy(&dst, &t_dst, 16); 3727 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst); 3728 printf("r blendvps "); 3729 showV128(&t_xmm0); 3730 printf(" "); 3731 showV128(&t_src); 3732 printf(" "); 
3733 showV128(&t_dst); 3734 printf(" -> "); 3735 showV128(&dst); 3736 printf("\n"); 3737 3738 memcpy(&xmm0, &t_xmm0, 16); 3739 memcpy(&src, &t_src, 16); 3740 memcpy(&dst, &t_dst, 16); 3741 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst); 3742 printf("m blendvps "); 3743 showV128(&t_xmm0); 3744 printf(" "); 3745 showV128(&t_src); 3746 printf(" "); 3747 showV128(&t_dst); 3748 printf(" -> "); 3749 showV128(&dst); 3750 printf("\n"); 3751 } 3752 } 3753 3754 void test_MOVNTDQA ( void ) 3755 { 3756 V128 src, dst; 3757 Int i; 3758 for (i = 0; i < 10; i++) { 3759 randV128(&src); 3760 /* make sure the load actually happens */ 3761 randV128(&dst); 3762 DO_m_r("movntdqa", src, dst); 3763 } 3764 } 3765 3766 /* ------------ main ------------ */ 3767 3768 int main ( int argc, char** argv ) 3769 { 3770 #if 1 3771 // ------ SSE 4.1 ------ 3772 test_BLENDPD(); // done Apr.01.2010 3773 test_BLENDPS(); // done Apr.02.2010 3774 test_PBLENDW(); 3775 test_PBLENDVB(); 3776 test_BLENDVPD(); 3777 test_BLENDVPS(); 3778 test_DPPD(); // done Apr.08.2010 3779 test_DPPS(); // done Apr.09.2010 3780 test_EXTRACTPS(); 3781 test_INSERTPS(); // done Apr.01.2010 3782 test_PCMPEQQ(); 3783 test_PEXTRB(); // done Apr.15.2010 3784 test_PEXTRD(); // done Apr.14.2010 3785 test_PEXTRQ(); // done Apr.14.2010 3786 test_PEXTRW(); // done Apr.14.2010 3787 test_PINSRQ(); // done Apr.16.2010 3788 test_PINSRD(); // todo 3789 test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? 
*/ 3790 test_PINSRB(); // todo 3791 test_PMAXSB(); 3792 test_PMAXSD(); // done Apr.09.2010 3793 test_PMAXUD(); // done Apr.16.2010 3794 test_PMAXUW(); 3795 test_PMINSB(); 3796 test_PMINSD(); // done Apr.09.2010 3797 test_PMINUD(); 3798 test_PMINUW(); 3799 test_PMOVSXBW(); // done Apr.02.2010 3800 test_PMOVSXBD(); // done Mar.30.2010 3801 test_PMOVSXBQ(); // done Mar.30.2010 3802 test_PMOVSXWD(); // done Mar.31.2010 3803 test_PMOVSXWQ(); // done Mar.31.2010 3804 test_PMOVSXDQ(); // done Mar.31.2010 3805 test_PMOVZXBW(); // done Mar.28.2010 3806 test_PMOVZXBD(); // done Mar.29.2010 3807 test_PMOVZXBQ(); // done Mar.29.2010 3808 test_PMOVZXWD(); // done Mar.28.2010 3809 test_PMOVZXWQ(); // done Mar.29.2010 3810 test_PMOVZXDQ(); // done Mar.29.2010 3811 test_POPCNTW(); 3812 test_POPCNTL(); 3813 test_POPCNTQ(); 3814 test_PMULDQ(); 3815 test_PMULLD(); 3816 test_PTEST(); 3817 test_ROUNDSD_w_immediate_rounding(); 3818 test_ROUNDSS_w_immediate_rounding(); 3819 test_ROUNDPD_w_immediate_rounding(); 3820 test_ROUNDPS_w_immediate_rounding(); 3821 test_ROUNDSD_w_mxcsr_rounding(); 3822 test_ROUNDSS_w_mxcsr_rounding(); 3823 test_ROUNDPD_w_mxcsr_rounding(); 3824 test_ROUNDPS_w_mxcsr_rounding(); 3825 // ------ SSE 4.2 ------ 3826 test_PCMPGTQ(); 3827 // CRC32B,Q 3828 test_PACKUSDW(); 3829 test_PHMINPOSUW(); 3830 test_MPSADBW(); 3831 test_MOVNTDQA(); /* not sure whether this is 4.1 or 4.2 */ 3832 #else 3833 test_MPSADBW(); 3834 #endif 3835 3836 return 0; 3837 } 3838 3839