//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa   %xmm0, %xmm2
        addss    %xmm1, %xmm2
        pshufd   $1, %xmm1, %xmm1       ## xmm1 = xmm1[1,0,0,0]
        pshufd   $1, %xmm0, %xmm3       ## xmm3 = xmm0[1,0,0,0]
        addss    %xmm1, %xmm3
        movaps   %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.


//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
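
Conceptually, the combine we want corresponds to selecting the constant's
address first and then doing a single load. A C-level sketch of that shape
(the CPI names are just illustrative stand-ins for the two constant pool
entries, not actual symbols):

static const double CPI1 = 123.412, CPI2 = 523.01123123;
double test3(int B) {
  const double *P = B ? &CPI1 : &CPI2;  /* select the address         */
  return *P;                            /* one load, no SSE required  */
}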

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0        ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

_X:
        subl $12, %esp
        xorpd %xmm0, %xmm0
        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        jb LBB_X_2
LBB_X_1:
        movsd %xmm0, %xmm2
LBB_X_2:
        movsd %xmm2, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (((x ^ y) & mask) == 0)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                         #59.21
        movaps    (%edx), %xmm5                         #60.21
        movaps    (%edx), %xmm4                         #61.21
        movaps    (%edx), %xmm3                         #62.21
        movl      40(%ecx), %ebp                        #69.49
        shufps    $0, %xmm2, %xmm5                      #60.21
        movl      100(%esp), %ebx                       #69.20
        movl      (%ebx), %edi                          #69.20
        imull     %ebp, %edi                            #69.49
        addl      (%eax), %edi                          #70.33
        shufps    $85, %xmm2, %xmm4                     #61.21
        shufps    $170, %xmm2, %xmm3                    #62.21
        shufps    $255, %xmm2, %xmm2                    #63.21
        lea       (%ebp,%ebp,2), %ebx                   #69.49
        negl      %ebx                                  #69.49
        lea       -3(%edi,%ebx), %ebx                   #70.33
        shll      $4, %ebx                              #68.37
        addl      32(%ecx), %ebx                        #68.37
        testb     $15, %bl                              #91.13
        jne       L_B1.24       # Prob 5%               #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, we end up
with a number of copies. Note that icc suffers from the same problem. Either
the instruction selector should select pshufd, or the register allocator could
perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        xorps %xmm0, %xmm0
        movaps c2(%esp), %xmm1
        movss %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of four
elements. This can be used to simplify a variety of shuffle operations, where
the other elements are fixed zeros.
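
The property being exploited is that a 4-byte movss load from memory already
brings in the three zeros, so the xorps + reg-reg movss pair can collapse into
a single load from the spill slot. In intrinsics terms, a sketch (the helper
name here is ours, not something in the tree):

#include <xmmintrin.h>
__m128 low_elt_with_zeros(const float *spill_slot) {
  return _mm_load_ss(spill_slot);   /* = { *spill_slot, 0, 0, 0 } */
}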

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        pxor %xmm1, %xmm1
        movaps %xmm0, %xmm2
        shufps $50, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        pslld   xmm1, 31        ; xmm1 = all 100000000000...

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl 4(%esp), %eax
        movd %eax, %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret
_y:
        movd 4(%esp), %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
like a single movd would be sufficient in both cases, as the value is already
zero extended in the 32-bit stack slot IIRC. For signed short, it should also
be safe, as an actually-negative value would give an undefined shift count for
pslld anyway.
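
For reference, the variable-count form passes the count in the low dword of an
XMM register, which is why a single movd of the (already zero-extended)
argument slot should be enough. An equivalent formulation with intrinsics
(a sketch only; "x_equiv" is a hypothetical name, not something in the tree):

#include <emmintrin.h>
extern __m128i a;
void x_equiv(unsigned short n) {
  a = _mm_sll_epi32(a, _mm_cvtsi32_si128(n));   /* movd + pslld */
}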


//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                        ## <4 x float>
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
        .long   2147483648      ## float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.
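
At the C level the transform is just a sign-bit flip on the integer bits; a
sketch of the shape we want the generated code to match (illustrative, not the
backend change itself):

#include <stdint.h>
#include <string.h>
void store_fneg(float *q, const float *p) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);    /* integer load              */
  bits ^= UINT32_C(0x80000000);     /* xor the sign bit          */
  memcpy(q, &bits, sizeof bits);    /* integer store, no xorps   */
}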

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
sitting between the truncate and the extract.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

'select' on vectors and scalars could be a whole lot better. We currently
lower them to conditional branches. On x86-64, for example, we compile this:

double test(double a, double b, double c, double d) { return a<b ? c : d; }

to:

_test:
        ucomisd %xmm0, %xmm1
        ja      LBB1_2  # entry
LBB1_1: # entry
        movapd  %xmm3, %xmm2
LBB1_2: # entry
        movapd  %xmm2, %xmm0
        ret

instead of:

_test:
        cmpltsd %xmm1, %xmm0
        andpd   %xmm0, %xmm2
        andnpd  %xmm3, %xmm0
        orpd    %xmm2, %xmm0
        ret

For unpredictable branches, the latter is much more efficient. This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not necessary. The
problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we could
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.
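
For the x86-64 case, a sketch of the insertps form we would like, written with
SSE4.1 intrinsics (the immediate places element 0 of the source into element 1
of the result and zeros the other lanes; illustrative only, not what the
backend emits today):

#include <smmintrin.h>
__m128 foo2_sse41(float x) {
  __m128 v = _mm_set_ss(x);           /* x in element 0               */
  return _mm_insert_ps(v, v, 0x1D);   /* { 0, x, 0, 0 }, one insertps */
}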

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps   %xmm1, %xmm2
        movlhps  %xmm0, %xmm2
        movaps   %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                        ## <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:
        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Likewise, insertelement of a constant into a vector of constants
should result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                        ## float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                        ## <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

becomes this IR:

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
  ret float %tmp12
}

which compiles to (x86-32):

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this,
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd 16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd". For example, this code:

double d1(double x) { return x == x ? x : x + x; }

Compiles into:

_d1:
        ucomisd %xmm0, %xmm0
        jnp     LBB1_2
        addsd   %xmm0, %xmm0
        ret
LBB1_2:
        ret

Also, the 'ret's should be shared. This is PR6032.
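
A sketch of the branchless cmpsd form for d1(), written with intrinsics just
to spell out the compare + and/andn/or select idiom (illustrative, not the
if-converter's output):

#include <emmintrin.h>
double d1_branchless(double x) {
  __m128d v    = _mm_set_sd(x);
  __m128d mask = _mm_cmpeq_sd(v, v);             /* all-ones unless x is NaN */
  __m128d sum  = _mm_add_sd(v, v);               /* x + x                    */
  __m128d r    = _mm_or_pd(_mm_and_pd(mask, v),
                           _mm_andnot_pd(mask, sum));
  return _mm_cvtsd_f64(r);
}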

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0], then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
        insertps $0, %xmm2, %xmm2
        insertps $16, %xmm3, %xmm2
        insertps $0, %xmm0, %xmm3
        insertps $16, %xmm1, %xmm3
        addps   %xmm2, %xmm3
        movq    %xmm3, (%rdi)
        movaps  %xmm3, %xmm0
        pshufd  $1, %xmm3, %xmm1
        ## kill: XMM1<def> XMM1<kill>
        ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

If SSE4.1 is available we should inline rounding functions instead of emitting
a libcall.

floor: roundsd $0x01, %xmm, %xmm
ceil:  roundsd $0x02, %xmm, %xmm

and likewise for the single precision versions.

Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
corresponding nodes and some targets (including X86) aren't ready for them.

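A sketch of the inline forms with SSE4.1 intrinsics (what the libcalls would
become; illustrative only, and the function names are ours):

#include <smmintrin.h>
double floor_sse41(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_floor_sd(v, v));   /* roundsd $0x01 */
}
double ceil_sse41(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_ceil_sd(v, v));    /* roundsd $0x02 */
}

//===---------------------------------------------------------------------===//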