//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
For example, compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa  %xmm0, %xmm2
        addss   %xmm1, %xmm2
        pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
        pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
        addss   %xmm1, %xmm3
        movaps  %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.
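
For reference, the same thing at the C level (function name invented for this
note); the point is that selecting between two FP constants should become a
single load whose constant-pool address is selected, not an SSE select or a
branch around two loads:

double test3_c(int B) {
  /* desired codegen: pick one of two adjacent constant-pool addresses,
     then do a single FP load from it */
  return B ? 123.412 : 523.01123123;
}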

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}

We currently emit:

_X:
        subl $12, %esp
        xorpd %xmm0, %xmm0
        addsd 24(%esp), %xmm0
        movsd 32(%esp), %xmm1
        movsd 16(%esp), %xmm2
        ucomisd 40(%esp), %xmm1
        jb LBB_X_2
LBB_X_1:
        movsd %xmm0, %xmm2
LBB_X_2:
        movsd %xmm2, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value.  Also movlps on the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems.  Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140.  This is what icc
emits:

        movaps    (%edx), %xmm2                         #59.21
        movaps    (%edx), %xmm5                         #60.21
        movaps    (%edx), %xmm4                         #61.21
        movaps    (%edx), %xmm3                         #62.21
        movl      40(%ecx), %ebp                        #69.49
        shufps    $0, %xmm2, %xmm5                      #60.21
        movl      100(%esp), %ebx                       #69.20
        movl      (%ebx), %edi                          #69.20
        imull     %ebp, %edi                            #69.49
        addl      (%eax), %edi                          #70.33
        shufps    $85, %xmm2, %xmm4                     #61.21
        shufps    $170, %xmm2, %xmm3                    #62.21
        shufps    $255, %xmm2, %xmm2                    #63.21
        lea       (%ebp,%ebp,2), %ebx                   #69.49
        negl      %ebx                                  #69.49
        lea       -3(%edi,%ebx), %ebx                   #70.33
        shll      $4, %ebx                              #68.37
        addl      32(%ecx), %ebx                        #68.37
        testb     $15, %bl                              #91.13
        jne       L_B1.24       # Prob 5%               #91.13

This is the LLVM code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok.
After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad.  The problem is that shufps is a destructive opcode:
because the same value appears as the second operand of more than one shufps,
a number of copies result.  Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator should perform the two-address to three-address transformation.

It also exposes some other problems.  See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed.  We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        xorps %xmm0, %xmm0
        movaps c2(%esp), %xmm1
        movss %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in the one element plus three zeros instead of all four
elements.  This can be used to simplify a variety of shuffle operations, where
the elements are fixed zeros.
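
A minimal intrinsics-level illustration of that observation (assuming SSE1;
the function name is just for this note): a 4-byte reload via movss already
brings in the three zero lanes for free.

#include <xmmintrin.h>
__m128 reload_low_lane(const float *spill_slot) {
  /* movss from memory zeroes lanes 1-3, so no extra xorps/shuffle is
     needed to materialize { *spill_slot, 0, 0, 0 }. */
  return _mm_load_ss(spill_slot);
}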

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl    8(%esp), %ecx
        movaps  (%ecx), %xmm0
        xor     %eax, %eax
        pinsrw  $6, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0
        movaps  %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or.  Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merges four scalar float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-math is
specified.  We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        pslld   xmm1, 31        ; xmm1 = all sign bits (0x80000000 per element)

instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <emmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
like movd would be sufficient in both cases as the value is already zero
extended in the 32-bit stack slot IIRC.  For signed short, it should also be
safe, as a genuinely negative shift count would be undefined for pslld anyway.
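
A sketch of the lowering we would prefer for "x", expressed with intrinsics
(function name made up for this note; assumes the i386 ABI point above, i.e.
the argument's 32-bit stack slot already holds a zero-extended value):

#include <emmintrin.h>
__m128i shift_by_short(__m128i v, unsigned short n) {
  /* _mm_cvtsi32_si128 is a single movd; no separate movzwl of the
     argument slot should be required. */
  return _mm_sll_epi32(v, _mm_cvtsi32_si128(n));
}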

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load.  This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be
   generalized to handle any load from P, P+4, P+8, P+12, where P can be
   anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                #  <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target-independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously.  Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything.  This is probably
related to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed.  The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be any vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, Linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.
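
For example, something along these lines (a sketch, assuming SSE4.1; the
immediate 0x1D = (src elem 0 << 6) | (dest lane 1 << 4) | zero-mask 0b1101,
i.e. move x into lane 1 and zero lanes 0, 2 and 3):

#include <smmintrin.h>
__m128 foo2_sse41 (float x) {
  __m128 v = _mm_set_ss (x);            /* x already arrives in lane 0 on x86-64 */
  return _mm_insert_ps (v, v, 0x1D);    /* one insertps: { 0, x, 0, 0 } */
}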

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps        $0x10, x2(%rip), %xmm0
        insertps        $0x10, x3(%rip), %xmm1
        movaps  %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps  %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ##  <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here.  And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:
        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants.  insertelement of a constant into a vector of constants should
also result in a vector of constants.  e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                ##  float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                ##  <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
        %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
        ret float %tmp12
}

which compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical.  movlps is slower than movsd on Core 2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic-programming-based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark's MonteCarlo kernel contains code like
this, for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in SciMark where using the fp stack is better: it is
cheaper to do fld1 than to load 1.0 from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.

//===---------------------------------------------------------------------===//

The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
"cmpsd".  For example, this code:

double d1(double x) { return x == x ? x : x + x; }

Compiles into:

_d1:
        ucomisd %xmm0, %xmm0
        jnp     LBB1_2
        addsd   %xmm0, %xmm0
        ret
LBB1_2:
        ret

Also, the 'ret's should be shared.  This is PR6032.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): Perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

On SSE4 machines, we compile this code:

define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
                          <2 x float> *%P) nounwind {
  %Z = fadd <2 x float> %Q, %R

  store <2 x float> %Z, <2 x float> *%P
  ret <2 x float> %Z
}

into:

_test2:                                 ## @test2
## BB#0:
        insertps        $0, %xmm2, %xmm2
        insertps        $16, %xmm3, %xmm2
        insertps        $0, %xmm0, %xmm3
        insertps        $16, %xmm1, %xmm3
        addps   %xmm2, %xmm3
        movq    %xmm3, (%rdi)
        movaps  %xmm3, %xmm0
        pshufd  $1, %xmm3, %xmm1
                                        ## kill: XMM1<def> XMM1<kill>
        ret

The insertps's of $0 are pointless complex copies.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions.  This is bad; fp div
is slow and not pipelined.  In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs.  This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd as well.
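
The intrinsics-level equivalent of what we would like to select (a sketch,
assuming SSE3; function name invented for this note):

#include <pmmintrin.h>
double f_hadd (__m128d p) {
  /* haddpd %xmm0, %xmm0 computes { p[0]+p[1], p[0]+p[1] };
     returning the low element gives the scalar sum in one instruction. */
  return _mm_cvtsd_f64(_mm_hadd_pd(p, p));
}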

//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

This should compile into cvttpd2dq instead of being scalarized into two
cvttsd2si instructions (see the sketch below).

//===---------------------------------------------------------------------===//
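
A C-level equivalent of the fptosi case above (a sketch, assuming SSE2;
function name invented for this note):

#include <emmintrin.h>
__m128i trunc_2xf64_to_2xi32 (__m128d in) {
  /* cvttpd2dq: both truncated results land in the low 64 bits,
     and the upper 64 bits are zeroed. */
  return _mm_cvttpd_epi32(in);
}

//===---------------------------------------------------------------------===//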