//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                             ## @f32
        pshufd  $1, %xmm0, %xmm1  ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2  ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0      ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                ## @f32
        movdqa   %xmm0, %xmm2
        addss    %xmm1, %xmm2
        pshufd   $1, %xmm1, %xmm1    ## xmm1 = xmm1[1,0,0,0]
        pshufd   $1, %xmm0, %xmm3    ## xmm3 = xmm0[1,0,0,0]
        addss    %xmm1, %xmm3
        movaps   %xmm2, %xmm0
        unpcklps %xmm3, %xmm0        ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: significant speedups are possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if ((x ^ y) & mask)
when using SSE.
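
A rough sketch in C of the equivalence being exploited (same_sign and the
memcpy bit-casts are illustrative, not from a testcase): the comparison is true
exactly when the sign bits agree, so with SSE it can become an xorps/pxor plus
a test against the sign-bit mask instead of two copysign calls and an FP
compare.

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);   /* bit-cast without aliasing trouble */
  memcpy(&yb, &y, sizeof yb);
  /* signs agree iff the xor of the bit patterns has a clear sign bit */
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}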

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also movlps for the
lower half of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps    (%edx), %xmm2                 #59.21
        movaps    (%edx), %xmm5                 #60.21
        movaps    (%edx), %xmm4                 #61.21
        movaps    (%edx), %xmm3                 #62.21
        movl      40(%ecx), %ebp                #69.49
        shufps    $0, %xmm2, %xmm5              #60.21
        movl      100(%esp), %ebx               #69.20
        movl      (%ebx), %edi                  #69.20
        imull     %ebp, %edi                    #69.49
        addl      (%eax), %edi                  #70.33
        shufps    $85, %xmm2, %xmm4             #61.21
        shufps    $170, %xmm2, %xmm3            #62.21
        shufps    $255, %xmm2, %xmm2            #63.21
        lea       (%ebp,%ebp,2), %ebx           #69.49
        negl      %ebx                          #69.49
        lea       -3(%edi,%ebx), %ebx           #70.33
        shll      $4, %ebx                      #68.37
        addl      32(%ecx), %ebx                #68.37
        testb     $15, %bl                      #91.13
        jne       L_B1.24       # Prob 5%       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
        %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %EAX = MOV32ri -3
        %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
        ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
        %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
        %EDX = MOV32rm %EDX, 1, %NOREG, 40
        IMUL32rr %EAX<def&use>, %EDX
        %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 0
        MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
        %EAX = LEA32r %ESI, 1, %EAX, -3
        %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
        %ESI = MOV32rm %ESI, 1, %NOREG, 32
        %EDI = MOV32rr %EAX
        SHL32ri %EDI<def&use>, 4
        ADD32rr %EDI<def&use>, %ESI
        %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
        %XMM1 = MOVAPSrr %XMM0
        SHUFPSrr %XMM1<def&use>, %XMM1, 170
        %XMM2 = MOVAPSrr %XMM0
        SHUFPSrr %XMM2<def&use>, %XMM2, 0
        %XMM3 = MOVAPSrr %XMM0
        SHUFPSrr %XMM3<def&use>, %XMM3, 255
        SHUFPSrr %XMM0<def&use>, %XMM0, 85
        %EBX = MOV32rr %EDI
        AND32ri8 %EBX<def&use>, 15
        CMP32ri8 %EBX, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
because the same value appears as operand two of more than one shufps, a
number of copies are needed. Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator should perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss   4(%esp), %xmm1
        mulss   %xmm1, %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Because the movss load zeros the upper elements and mulss doesn't modify the
top 3 elements, the top elements of xmm1 are already zeroed. We could compile
this to:

        movss   4(%esp), %xmm0
        mulss   %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps  c(%esp), %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        xorps   %xmm0, %xmm0
        movaps  c2(%esp), %xmm1
        movss   %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        movss   c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros plus the one element instead of all four
elements. This can be used to simplify a variety of shuffle operations where
the other elements are known zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl    8(%esp), %ecx
        movaps  (%ecx), %xmm0
        xor     %eax, %eax
        pinsrw  $6, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0
        movaps  %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
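
A hypothetical example of the pattern (the function and constant values are
illustrative, not from an existing testcase): each scalar constant could come
from its own constant-pool entry and load, but laying the four values out
contiguously would allow a single 16-byte movaps load.

#include <xmmintrin.h>

__m128 weights(void) {
  /* ideally one 128-bit constant-pool load, not four merged scalar loads */
  return _mm_set_ps(0.25f, 0.5f, 0.75f, 1.0f);
}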

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1   ; xmm1 = all-ones
        pslld   xmm1, 31     ; xmm1 = all 100000000000... (sign bits)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does a silly movzwl through a GPR. It seems like movd
would be sufficient in both cases, since the value is already zero extended in
the 32-bit stack slot IIRC. For a signed short it should also be safe, because
a truly negative shift count would be undefined for pslld anyway.

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.
For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1        ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                #  <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 elements simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before register allocation runs.

At that point we don't know whether there will be any vector spills or not,
so the stack realignment logic has to be overly conservative; otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps  %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps  %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                ##  <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
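
As a sketch of the shift-and-add expansion for the multiply-by-10 case above
(written with SSE2 intrinsics purely for illustration; the backend would emit
the pslld/paddd sequence directly):

#include <emmintrin.h>

static __m128i mul10_epi32(__m128i x) {
  __m128i x8 = _mm_slli_epi32(x, 3);   /* pslld $3: x*8 */
  __m128i x2 = _mm_slli_epi32(x, 1);   /* pslld $1: x*2 */
  return _mm_add_epi32(x8, x2);        /* paddd: x*10   */
}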

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:
        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align 4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be:
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Insertelement of a constant into a vector of constants should
also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                ##  float
        .long   1065353216      ## float 1
        .const

        .align  4
LCPI1_0:                                ##  <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to this IR (x86-32):

define float @foo(i8 zeroext %x) nounwind {
        %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
        ret float %tmp12
}

which codegens to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE.
SciMark's MonteCarlo contains code like this, for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in SciMark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the
sqrtss and inversion with an rsqrtss instruction, which computes 1/sqrt faster
at the cost of reduced accuracy.
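
A sketch of the intended fast-math rewrite, expressed at the source level
(norm_fast is an illustrative name; the note proposes having the compiler do
this, and only under unsafe-fp/-ffast-math since it changes rounding):

void foo(double, double, double);

void norm_fast(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  double inv = 1.0 / scale;            /* one divsd...           */
  foo(x*inv, y*inv, z*inv);            /* ...and three mulsd     */
}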

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.

//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.

//===---------------------------------------------------------------------===//