1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
4
5 //===---------------------------------------------------------------------===//
6
7 SSE Variable shift can be custom lowered to something like this, which uses a
8 small table + unaligned load + shuffle instead of going through memory.
9
10 __m128i_shift_right:
11 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14 ...
15 __m128i shift_right(__m128i value, unsigned long offset) {
16 return _mm_shuffle_epi8(value,
_mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
18 }
19
20 //===---------------------------------------------------------------------===//
21
SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:
24
25 typedef float __attribute__((vector_size(16))) v4f32;
26 float f32(v4f32 A) {
27 return A[0]+A[1]+A[2]+A[3];
28 }
29
30 Instead we get this:
31
32 _f32: ## @f32
33 pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
34 addss %xmm0, %xmm1
35 pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
36 movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
37 movaps %xmm0, %xmm3
38 addss %xmm1, %xmm3
39 movdqa %xmm2, %xmm0
40 addss %xmm3, %xmm0
41 ret
42
43 Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:
45
46 _Complex float f32(_Complex float A, _Complex float B) {
47 return A+B;
48 }
49
50 into:
51
52 _f32: ## @f32
53 movdqa %xmm0, %xmm2
54 addss %xmm1, %xmm2
55 pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
56 pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
57 addss %xmm1, %xmm3
58 movaps %xmm2, %xmm0
59 unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60 ret
61
62 seems silly when it could just be one addps.
63
64
65 //===---------------------------------------------------------------------===//
66
67 Expand libm rounding functions inline: Significant speedups possible.
68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70 //===---------------------------------------------------------------------===//
71
When compiled with unsafe FP math enabled, "main" should enable SSE DAZ mode and
73 other fast SSE modes.
74
75 //===---------------------------------------------------------------------===//
76
77 Think about doing i64 math in SSE regs on x86-32.
78
79 //===---------------------------------------------------------------------===//
80
81 This testcase should have no SSE instructions in it, and only one load from
82 a constant pool:
83
84 double %test3(bool %B) {
85 %C = select bool %B, double 123.412, double 523.01123123
86 ret double %C
87 }
88
89 Currently, the select is being lowered, which prevents the dag combiner from
90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91
92 The pattern isel got this one right.
93
94 //===---------------------------------------------------------------------===//
95
96 SSE should implement 'select_cc' using 'emulated conditional moves' that use
97 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
98
99 double %X(double %Y, double %Z, double %A, double %B) {
100 %C = setlt double %A, %B
101 %z = fadd double %Z, 0.0 ;; select operand is not a load
102 %D = select bool %C, double %Y, double %z
103 ret double %D
104 }
105
106 We currently emit:
107
108 _X:
109 subl $12, %esp
110 xorpd %xmm0, %xmm0
111 addsd 24(%esp), %xmm0
112 movsd 32(%esp), %xmm1
113 movsd 16(%esp), %xmm2
114 ucomisd 40(%esp), %xmm1
115 jb LBB_X_2
116 LBB_X_1:
117 movsd %xmm0, %xmm2
118 LBB_X_2:
119 movsd %xmm2, (%esp)
120 fldl (%esp)
121 addl $12, %esp
122 ret
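
A branchless version of the above, written with SSE2 intrinsics just to
illustrate the cmp/and/andnot/or idiom (a sketch; the function name is made up
and this is not what the backend currently emits):

#include <emmintrin.h>

double select_cc(double Y, double Z, double A, double B) {
  __m128d y = _mm_set_sd(Y);
  __m128d z = _mm_set_sd(Z);
  __m128d m = _mm_cmplt_sd(_mm_set_sd(A), _mm_set_sd(B)); /* all-ones if A < B */
  /* (m & y) | (~m & z): picks Y when A < B, Z otherwise, with no branch. */
  __m128d r = _mm_or_pd(_mm_and_pd(m, y), _mm_andnot_pd(m, z));
  return _mm_cvtsd_f64(r);
}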
123
124 //===---------------------------------------------------------------------===//
125
126 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
127 feasible.
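
For a concrete picture, the expansion of a small fixed-size memcpy might look
roughly like this at source level (a sketch only; alignment and size checks are
omitted, and copy64 is a made-up name):

#include <emmintrin.h>
#include <stddef.h>

static void copy64(void *dst, const void *src) {
  /* Four 16-byte SSE moves instead of a call to memcpy. */
  for (size_t i = 0; i < 64; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)((const char *)src + i));
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
  }
}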
128
129 //===---------------------------------------------------------------------===//
130
131 Codegen:
132 if (copysign(1.0, x) == copysign(1.0, y))
133 into:
134 if (x^y & mask)
135 when using SSE.
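
The bit-level form of that test, spelled out in C (a sketch; same_sign is a
made-up name): the signs are equal iff the sign bit of the xor of the two bit
patterns is clear.

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);   /* reinterpret the doubles as integer bits */
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}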
136
137 //===---------------------------------------------------------------------===//
138
139 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
140 of a v4sf value.
141
142 //===---------------------------------------------------------------------===//
143
Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
146
147 //===---------------------------------------------------------------------===//
148
149 External test Nurbs exposed some problems. Look for
150 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
151 emits:
152
153 movaps (%edx), %xmm2 #59.21
154 movaps (%edx), %xmm5 #60.21
155 movaps (%edx), %xmm4 #61.21
156 movaps (%edx), %xmm3 #62.21
157 movl 40(%ecx), %ebp #69.49
158 shufps $0, %xmm2, %xmm5 #60.21
159 movl 100(%esp), %ebx #69.20
160 movl (%ebx), %edi #69.20
161 imull %ebp, %edi #69.49
162 addl (%eax), %edi #70.33
163 shufps $85, %xmm2, %xmm4 #61.21
164 shufps $170, %xmm2, %xmm3 #62.21
165 shufps $255, %xmm2, %xmm2 #63.21
166 lea (%ebp,%ebp,2), %ebx #69.49
167 negl %ebx #69.49
168 lea -3(%edi,%ebx), %ebx #70.33
169 shll $4, %ebx #68.37
170 addl 32(%ecx), %ebx #68.37
171 testb $15, %bl #91.13
172 jne L_B1.24 # Prob 5% #91.13
173
174 This is the llvm code after instruction scheduling:
175
176 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
177 %reg1078 = MOV32ri -3
178 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
179 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
180 %reg1080 = IMUL32rr %reg1079, %reg1037
181 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
182 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
183 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
184 %reg1082 = SHL32ri %reg1038, 4
185 %reg1039 = ADD32rr %reg1036, %reg1082
186 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
187 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
188 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
189 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
190 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
191 %reg1040 = MOV32rr %reg1039
192 %reg1084 = AND32ri8 %reg1039, 15
193 CMP32ri8 %reg1084, 0
194 JE mbb<cond_next204,0xa914d30>
195
196 Still ok. After register allocation:
197
198 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
199 %EAX = MOV32ri -3
200 %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
201 ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
202 %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
203 %EDX = MOV32rm %EDX, 1, %NOREG, 40
204 IMUL32rr %EAX<def&use>, %EDX
205 %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
206 %ESI = MOV32rm %ESI, 1, %NOREG, 0
207 MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
208 %EAX = LEA32r %ESI, 1, %EAX, -3
209 %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
210 %ESI = MOV32rm %ESI, 1, %NOREG, 32
211 %EDI = MOV32rr %EAX
212 SHL32ri %EDI<def&use>, 4
213 ADD32rr %EDI<def&use>, %ESI
214 %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
215 %XMM1 = MOVAPSrr %XMM0
216 SHUFPSrr %XMM1<def&use>, %XMM1, 170
217 %XMM2 = MOVAPSrr %XMM0
218 SHUFPSrr %XMM2<def&use>, %XMM2, 0
219 %XMM3 = MOVAPSrr %XMM0
220 SHUFPSrr %XMM3<def&use>, %XMM3, 255
221 SHUFPSrr %XMM0<def&use>, %XMM0, 85
222 %EBX = MOV32rr %EDI
223 AND32ri8 %EBX<def&use>, 15
224 CMP32ri8 %EBX, 0
225 JE mbb<cond_next204,0xa914d30>
226
This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, a number
of copies result. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator could
perform the two-address to three-address transformation.
232
233 It also exposes some other problems. See MOV32ri -3 and the spills.
234
235 //===---------------------------------------------------------------------===//
236
237 Consider:
238
239 __m128 test(float a) {
240 return _mm_set_ps(0.0, 0.0, 0.0, a*a);
241 }
242
243 This compiles into:
244
245 movss 4(%esp), %xmm1
246 mulss %xmm1, %xmm1
247 xorps %xmm0, %xmm0
248 movss %xmm1, %xmm0
249 ret
250
251 Because mulss doesn't modify the top 3 elements, the top elements of
252 xmm1 are already zero'd. We could compile this to:
253
254 movss 4(%esp), %xmm0
255 mulss %xmm0, %xmm0
256 ret
257
258 //===---------------------------------------------------------------------===//
259
260 Here's a sick and twisted idea. Consider code like this:
261
262 __m128 test(__m128 a) {
float b = *(float*)&a;
264 ...
265 return _mm_set_ps(0.0, 0.0, 0.0, b);
266 }
267
268 This might compile to this code:
269
270 movaps c(%esp), %xmm1
271 xorps %xmm0, %xmm0
272 movss %xmm1, %xmm0
273 ret
274
275 Now consider if the ... code caused xmm1 to get spilled. This might produce
276 this code:
277
278 movaps c(%esp), %xmm1
279 movaps %xmm1, c2(%esp)
280 ...
281
282 xorps %xmm0, %xmm0
283 movaps c2(%esp), %xmm1
284 movss %xmm1, %xmm0
285 ret
286
287 However, since the reload is only used by these instructions, we could
288 "fold" it into the uses, producing something like this:
289
290 movaps c(%esp), %xmm1
291 movaps %xmm1, c2(%esp)
292 ...
293
294 movss c2(%esp), %xmm0
295 ret
296
297 ... saving two instructions.
298
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element used instead of all 4
elements. This can be used to simplify a variety of shuffle operations where
some of the elements are known to be zero.
303
304 //===---------------------------------------------------------------------===//
305
306 This code generates ugly code, probably due to costs being off or something:
307
308 define void @test(float* %P, <4 x float>* %P2 ) {
309 %xFloat0.688 = load float* %P
310 %tmp = load <4 x float>* %P2
311 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
312 store <4 x float> %inFloat3.713, <4 x float>* %P2
313 ret void
314 }
315
316 Generates:
317
318 _test:
319 movl 8(%esp), %eax
320 movaps (%eax), %xmm0
321 pxor %xmm1, %xmm1
322 movaps %xmm0, %xmm2
323 shufps $50, %xmm1, %xmm2
324 shufps $132, %xmm2, %xmm0
325 movaps %xmm0, (%eax)
326 ret
327
328 Would it be better to generate:
329
330 _test:
331 movl 8(%esp), %ecx
332 movaps (%ecx), %xmm0
333 xor %eax, %eax
334 pinsrw $6, %eax, %xmm0
335 pinsrw $7, %eax, %xmm0
336 movaps %xmm0, (%ecx)
337 ret
338
339 ?
340
341 //===---------------------------------------------------------------------===//
342
343 Some useful information in the Apple Altivec / SSE Migration Guide:
344
345 http://developer.apple.com/documentation/Performance/Conceptual/
346 Accelerate_sse_migration/index.html
347
348 e.g. SSE select using and, andnot, or. Various SSE compare translations.
349
350 //===---------------------------------------------------------------------===//
351
352 Add hooks to commute some CMPP operations.
353
354 //===---------------------------------------------------------------------===//
355
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
358
359 //===---------------------------------------------------------------------===//
360
361 Floating point max / min are commutable when -enable-unsafe-fp-path is
362 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
363 nodes which are selected to max / min instructions that are marked commutable.
364
365 //===---------------------------------------------------------------------===//
366
367 We should materialize vector constants like "all ones" and "signbit" with
368 code like:
369
370 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
371
372 and:
373 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
pslld xmm1, 31      ; xmm1 = 0x80000000 in each lane (just the sign bits)
375
instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
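
At source level the idea looks roughly like this with integer intrinsics (a
sketch; the helpers are made-up names, and a compiler is of course free to
materialize these differently):

#include <emmintrin.h>

static __m128i all_ones(void) {
  __m128i z = _mm_setzero_si128();
  return _mm_cmpeq_epi32(z, z);      /* every lane compares equal -> all ones */
}
static __m128i sign_bits(void) {
  return _mm_slli_epi32(all_ones(), 31);   /* 0x80000000 in each lane */
}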
378
379 //===---------------------------------------------------------------------===//
380
381 These functions:
382
383 #include <xmmintrin.h>
384 __m128i a;
385 void x(unsigned short n) {
386 a = _mm_slli_epi32 (a, n);
387 }
388 void y(unsigned n) {
389 a = _mm_slli_epi32 (a, n);
390 }
391
392 compile to ( -O3 -static -fomit-frame-pointer):
393 _x:
394 movzwl 4(%esp), %eax
395 movd %eax, %xmm0
396 movaps _a, %xmm1
397 pslld %xmm0, %xmm1
398 movaps %xmm1, _a
399 ret
400 _y:
401 movd 4(%esp), %xmm0
402 movaps _a, %xmm1
403 pslld %xmm0, %xmm1
404 movaps %xmm1, _a
405 ret
406
407 "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
408 like movd would be sufficient in both cases as the value is already zero
409 extended in the 32-bit stack slot IIRC. For signed short, it should also be
410 save, as a really-signed value would be undefined for pslld.
411
412
413 //===---------------------------------------------------------------------===//
414
415 #include <math.h>
416 int t1(double d) { return signbit(d); }
417
418 This currently compiles to:
419 subl $12, %esp
420 movsd 16(%esp), %xmm0
421 movsd %xmm0, (%esp)
422 movl 4(%esp), %eax
423 shrl $31, %eax
424 addl $12, %esp
425 ret
426
427 We should use movmskp{s|d} instead.
428
429 //===---------------------------------------------------------------------===//
430
431 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
432 (aligned) vector load. This functionality has a couple of problems.
433
434 1. The code to infer alignment from loads of globals is in the X86 backend,
435 not the dag combiner. This is because dagcombine2 needs to be able to see
436 through the X86ISD::Wrapper node, which DAGCombine can't really do.
437 2. The code for turning 4 x load into a single vector load is target
438 independent and should be moved to the dag combiner.
439 3. The code for turning 4 x load into a vector load can only handle a direct
440 load from a global or a direct load from the stack. It should be generalized
441 to handle any load from P, P+4, P+8, P+12, where P can be anything.
442 4. The alignment inference code cannot handle loads from globals in non-static
443 mode because it doesn't look through the extra dyld stub load. If you try
444 vec_align.ll without -relocation-model=static, you'll see what I mean.
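
As an example of the generalization point 3 asks for, four adjacent scalar
loads through an arbitrary pointer ought to become a single vector load when
alignment permits (a hypothetical testcase, not taken from vec_align.ll):

typedef float v4f32 __attribute__((vector_size(16)));

v4f32 gather4(const float *P) {
  /* Loads from P, P+4, P+8, P+12 that should merge into one 16-byte load. */
  v4f32 r = { P[0], P[1], P[2], P[3] };
  return r;
}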
445
446 //===---------------------------------------------------------------------===//
447
448 We should lower store(fneg(load p), q) into an integer load+xor+store, which
449 eliminates a constant pool load. For example, consider:
450
451 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
452 entry:
453 %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
454 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
455 ret i64 %tmp20
456 }
457 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
458
459 This currently compiles to:
460
461 LCPI1_0: # <4 x float>
462 .long 2147483648 # float -0
463 .long 2147483648 # float -0
464 .long 2147483648 # float -0
465 .long 2147483648 # float -0
466 _ccosf:
467 subl $12, %esp
468 movss 16(%esp), %xmm0
469 movss %xmm0, 4(%esp)
470 movss 20(%esp), %xmm0
471 xorps LCPI1_0, %xmm0
472 movss %xmm0, (%esp)
473 call L_ccoshf$stub
474 addl $12, %esp
475 ret
476
477 Note the load into xmm0, then xor (to negate), then store. In PIC mode,
478 this code computes the pic base and does two loads to do the constant pool
479 load, so the improvement is much bigger.
480
481 The tricky part about this xform is that the argument load/store isn't exposed
482 until post-legalize, and at that point, the fneg has been custom expanded into
483 an X86 fxor. This means that we need to handle this case in the x86 backend
484 instead of in target independent code.
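
For reference, the integer form of the transform looks like this in C for a
float (a sketch; store_negated is a made-up name):

#include <stdint.h>
#include <string.h>

static void store_negated(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);
  bits ^= 0x80000000u;            /* fneg is just an xor of the sign bit */
  memcpy(q, &bits, sizeof bits);  /* no FP constant-pool load needed */
}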
485
486 //===---------------------------------------------------------------------===//
487
488 Non-SSE4 insert into 16 x i8 is atrociously bad.
489
490 //===---------------------------------------------------------------------===//
491
492 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
493 is memory.
494
495 //===---------------------------------------------------------------------===//
496
497 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
498 sitting between the truncate and the extract.
499
500 //===---------------------------------------------------------------------===//
501
502 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
503 any number of 0.0 simultaneously. Currently we only use it for simple
504 insertions.
505
506 See comments in LowerINSERT_VECTOR_ELT_SSE4.
507
508 //===---------------------------------------------------------------------===//
509
510 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
511 Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
512 legal, it'll just take a few extra patterns written in the .td file.
513
514 Note: this is not a code quality issue; the custom lowered code happens to be
515 right, but we shouldn't have to custom lower anything. This is probably related
516 to <2 x i64> ops being so bad.
517
518 //===---------------------------------------------------------------------===//
519
520 'select' on vectors and scalars could be a whole lot better. We currently
521 lower them to conditional branches. On x86-64 for example, we compile this:
522
523 double test(double a, double b, double c, double d) { return a<b ? c : d; }
524
525 to:
526
527 _test:
528 ucomisd %xmm0, %xmm1
529 ja LBB1_2 # entry
530 LBB1_1: # entry
531 movapd %xmm3, %xmm2
532 LBB1_2: # entry
533 movapd %xmm2, %xmm0
534 ret
535
536 instead of:
537
538 _test:
539 cmpltsd %xmm1, %xmm0
540 andpd %xmm0, %xmm2
541 andnpd %xmm3, %xmm0
542 orpd %xmm2, %xmm0
543 ret
544
For unpredictable branches, the latter is much more efficient. This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.
548
549 //===---------------------------------------------------------------------===//
550
LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but if it were not, we
could end up producing unaligned loads/stores.
558
559 Fixing this will require some huge RA changes.
560
561 Testcase:
562 #include <emmintrin.h>
563
564 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
565
566 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
567 - 22725, - 12873};;
568
569 vSInt16 madd(vSInt16 b)
570 {
571 return _mm_madd_epi16(a, b);
572 }
573
574 Generated code (x86-32, linux):
575 madd:
576 pushl %ebp
577 movl %esp, %ebp
578 andl $-16, %esp
579 movaps .LCPI1_0, %xmm1
580 pmaddwd %xmm1, %xmm0
581 movl %ebp, %esp
582 popl %ebp
583 ret
584
585 //===---------------------------------------------------------------------===//
586
587 Consider:
588 #include <emmintrin.h>
589 __m128 foo2 (float x) {
590 return _mm_set_ps (0, 0, x, 0);
591 }
592
593 In x86-32 mode, we generate this spiffy code:
594
595 _foo2:
596 movss 4(%esp), %xmm0
597 pshufd $81, %xmm0, %xmm0
598 ret
599
600 in x86-64 mode, we generate this code, which could be better:
601
602 _foo2:
603 xorps %xmm1, %xmm1
604 movss %xmm0, %xmm1
605 pshufd $81, %xmm1, %xmm0
606 ret
607
In SSE4 mode, we could use insertps to make both better.
609
610 Here's another testcase that could use insertps [mem]:
611
612 #include <xmmintrin.h>
613 extern float x2, x3;
614 __m128 foo1 (float x1, float x4) {
615 return _mm_set_ps (x2, x1, x3, x4);
616 }
617
618 gcc mainline compiles it to:
619
620 foo1:
621 insertps $0x10, x2(%rip), %xmm0
622 insertps $0x10, x3(%rip), %xmm1
623 movaps %xmm1, %xmm2
624 movlhps %xmm0, %xmm2
625 movaps %xmm2, %xmm0
626 ret
627
628 //===---------------------------------------------------------------------===//
629
630 We compile vector multiply-by-constant into poor code:
631
632 define <4 x i32> @f(<4 x i32> %i) nounwind {
633 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
634 ret <4 x i32> %A
635 }
636
637 On targets without SSE4.1, this compiles into:
638
639 LCPI1_0: ## <4 x i32>
640 .long 10
641 .long 10
642 .long 10
643 .long 10
644 .text
645 .align 4,0x90
646 .globl _f
647 _f:
648 pshufd $3, %xmm0, %xmm1
649 movd %xmm1, %eax
650 imull LCPI1_0+12, %eax
651 movd %eax, %xmm1
652 pshufd $1, %xmm0, %xmm2
653 movd %xmm2, %eax
654 imull LCPI1_0+4, %eax
655 movd %eax, %xmm2
656 punpckldq %xmm1, %xmm2
657 movd %xmm0, %eax
658 imull LCPI1_0, %eax
659 movd %eax, %xmm1
660 movhlps %xmm0, %xmm0
661 movd %xmm0, %eax
662 imull LCPI1_0+8, %eax
663 movd %eax, %xmm0
664 punpckldq %xmm0, %xmm1
665 movaps %xmm1, %xmm0
666 punpckldq %xmm2, %xmm0
667 ret
668
669 It would be better to synthesize integer vector multiplication by constants
670 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
671 simple cases such as multiplication by powers of two would be better as
672 vector shifts than as multiplications.
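
For the example above, the shift/add lowering being suggested corresponds to
something like this with intrinsics (a sketch; mul10 is a made-up name):

#include <emmintrin.h>

static __m128i mul10(__m128i x) {
  /* 10*x = 8*x + 2*x: two pslld and one paddd, no pmulld or scalar imull. */
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}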
673
674 //===---------------------------------------------------------------------===//
675
676 We compile this:
677
678 __m128i
679 foo2 (char x)
680 {
681 return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
682 }
683
684 into:
685 movl $1, %eax
686 xorps %xmm0, %xmm0
687 pinsrw $2, %eax, %xmm0
688 movzbl 4(%esp), %eax
689 pinsrw $3, %eax, %xmm0
690 movl $256, %eax
691 pinsrw $7, %eax, %xmm0
692 ret
693
694
695 gcc-4.2:
696 subl $12, %esp
697 movzbl 16(%esp), %eax
698 movdqa LC0, %xmm0
699 pinsrw $3, %eax, %xmm0
700 addl $12, %esp
701 ret
702 .const
703 .align 4
704 LC0:
705 .word 0
706 .word 0
707 .word 1
708 .word 0
709 .word 0
710 .word 0
711 .word 0
712 .word 256
713
714 With SSE4, it should be
715 movdqa .LC0(%rip), %xmm0
716 pinsrb $6, %edi, %xmm0
717
718 //===---------------------------------------------------------------------===//
719
720 We should transform a shuffle of two vectors of constants into a single vector
721 of constants. Also, insertelement of a constant into a vector of constants
should result in a vector of constants, e.g. 2008-06-25-VecISelBug.ll.
723
724 We compiled it to something horrible:
725
726 .align 4
727 LCPI1_1: ## float
728 .long 1065353216 ## float 1
729 .const
730
731 .align 4
732 LCPI1_0: ## <4 x float>
733 .space 4
734 .long 1065353216 ## float 1
735 .space 4
736 .long 1065353216 ## float 1
737 .text
738 .align 4,0x90
739 .globl _t
740 _t:
741 xorps %xmm0, %xmm0
742 movhps LCPI1_0, %xmm0
743 movss LCPI1_1, %xmm1
744 movaps %xmm0, %xmm2
745 shufps $2, %xmm1, %xmm2
746 shufps $132, %xmm2, %xmm0
747 movaps %xmm0, 0
748
749 //===---------------------------------------------------------------------===//
750 rdar://5907648
751
752 This function:
753
754 float foo(unsigned char x) {
755 return x;
756 }
757
758 compiles to (x86-32):
759
760 define float @foo(i8 zeroext %x) nounwind {
761 %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
762 ret float %tmp12
763 }
764
765 compiles to:
766
767 _foo:
768 subl $4, %esp
769 movzbl 8(%esp), %eax
770 cvtsi2ss %eax, %xmm0
771 movss %xmm0, (%esp)
772 flds (%esp)
773 addl $4, %esp
774 ret
775
776 We should be able to use:
cvtsi2ss 8(%esp), %xmm0
778 since we know the stack slot is already zext'd.
779
780 //===---------------------------------------------------------------------===//
781
782 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
783 when code size is critical. movlps is slower than movsd on core2 but it's one
784 byte shorter.
785
786 //===---------------------------------------------------------------------===//
787
788 We should use a dynamic programming based approach to tell when using FPStack
789 operations is cheaper than SSE. SciMark montecarlo contains code like this
790 for example:
791
792 double MonteCarlo_num_flops(int Num_samples) {
793 return ((double) Num_samples)* 4.0;
794 }
795
796 In fpstack mode, this compiles into:
797
798 LCPI1_0:
799 .long 1082130432 ## float 4.000000e+00
800 _MonteCarlo_num_flops:
801 subl $4, %esp
802 movl 8(%esp), %eax
803 movl %eax, (%esp)
804 fildl (%esp)
805 fmuls LCPI1_0
806 addl $4, %esp
807 ret
808
809 in SSE mode, it compiles into significantly slower code:
810
811 _MonteCarlo_num_flops:
812 subl $12, %esp
813 cvtsi2sd 16(%esp), %xmm0
814 mulsd LCPI1_0, %xmm0
815 movsd %xmm0, (%esp)
816 fldl (%esp)
817 addl $12, %esp
818 ret
819
There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.
823
824 //===---------------------------------------------------------------------===//
825
826 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
827 "cmpsd". For example, this code:
828
829 double d1(double x) { return x == x ? x : x + x; }
830
831 Compiles into:
832
833 _d1:
834 ucomisd %xmm0, %xmm0
835 jnp LBB1_2
836 addsd %xmm0, %xmm0
837 ret
838 LBB1_2:
839 ret
840
841 Also, the 'ret's should be shared. This is PR6032.
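
A branchless shape of d1, written with intrinsics only to show the
cmpsd-as-mask idea (a sketch of the desired if-conversion, not current output;
d1_branchless is a made-up name):

#include <emmintrin.h>

static double d1_branchless(double x) {
  __m128d v    = _mm_set_sd(x);
  __m128d mask = _mm_cmpeq_sd(v, v);     /* all-ones unless x is NaN */
  __m128d sum  = _mm_add_sd(v, v);
  /* (mask & x) | (~mask & (x+x)) */
  __m128d r = _mm_or_pd(_mm_and_pd(mask, v), _mm_andnot_pd(mask, sum));
  return _mm_cvtsd_f64(r);
}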
842
843 //===---------------------------------------------------------------------===//
844
These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?
847
848 define float @foo(float %x) nounwind {
849 %t = bitcast float %x to i32
850 %s = and i32 %t, 2147483647
851 %d = bitcast i32 %s to float
852 ret float %d
853 }
854
855 declare float @fabsf(float %n)
856 define float @bar(float %x) nounwind {
857 %d = call float @fabsf(float %x)
858 ret float %d
859 }
860
861 //===---------------------------------------------------------------------===//
862
863 This IR (from PR6194):
864
865 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
866 target triple = "x86_64-apple-darwin10.0.0"
867
868 %0 = type { double, double }
869 %struct.float3 = type { float, float, float }
870
871 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
872 entry:
873 %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
874 %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
875 %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
876 %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
877 %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
878 %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
879 %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
880 store float %tmp12, float* %tmp5
881 ret void
882 }
883
884 Compiles to:
885
886 _test: ## @test
887 movd %xmm0, %rax
888 shrq $32, %rax
889 movl %eax, 4(%rdi)
890 ret
891
892 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
893 doing a shuffle from v[1] to v[0] then a float store.
894
895 //===---------------------------------------------------------------------===//
896
897 On SSE4 machines, we compile this code:
898
899 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
900 <2 x float> *%P) nounwind {
901 %Z = fadd <2 x float> %Q, %R
902
903 store <2 x float> %Z, <2 x float> *%P
904 ret <2 x float> %Z
905 }
906
907 into:
908
909 _test2: ## @test2
910 ## BB#0:
911 insertps $0, %xmm2, %xmm2
912 insertps $16, %xmm3, %xmm2
913 insertps $0, %xmm0, %xmm3
914 insertps $16, %xmm1, %xmm3
915 addps %xmm2, %xmm3
916 movq %xmm3, (%rdi)
917 movaps %xmm3, %xmm0
918 pshufd $1, %xmm3, %xmm1
919 ## kill: XMM1<def> XMM1<kill>
920 ret
921
922 The insertps's of $0 are pointless complex copies.
923
924 //===---------------------------------------------------------------------===//
925
926 [UNSAFE FP]
927
928 void foo(double, double, double);
929 void norm(double x, double y, double z) {
930 double scale = __builtin_sqrt(x*x + y*y + z*z);
931 foo(x/scale, y/scale, z/scale);
932 }
933
We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
935 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
936 and emit 3 mulsd in place of the divs. This can be done as a target-independent
937 transform.
938
939 If we're dealing with floats instead of doubles we could even replace the sqrtss
940 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
941 cost of reduced accuracy.
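
The rewrite being described, at source level (a sketch that assumes the foo
declaration above; norm_fast is a made-up name):

void norm_fast(double x, double y, double z) {
  double inv = 1.0 / __builtin_sqrt(x*x + y*y + z*z);  /* one divide ...      */
  foo(x*inv, y*inv, z*inv);                            /* ... three multiplies */
}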
942
943 //===---------------------------------------------------------------------===//
944
1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend.
3 //===---------------------------------------------------------------------===//
4
5 This should be one DIV/IDIV instruction, not a libcall:
6
7 unsigned test(unsigned long long X, unsigned Y) {
8 return X/Y;
9 }
10
11 This can be done trivially with a custom legalizer. What about overflow
12 though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
13
14 //===---------------------------------------------------------------------===//
15
16 Improvements to the multiply -> shift/add algorithm:
17 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
18
19 //===---------------------------------------------------------------------===//
20
21 Improve code like this (occurs fairly frequently, e.g. in LLVM):
22 long long foo(int x) { return 1LL << x; }
23
24 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
25 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
26 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
27
28 Another useful one would be ~0ULL >> X and ~0ULL << X.
29
30 One better solution for 1LL << x is:
31 xorl %eax, %eax
32 xorl %edx, %edx
33 testb $32, %cl
34 sete %al
35 setne %dl
36 sall %cl, %eax
37 sall %cl, %edx
38
39 But that requires good 8-bit subreg support.
40
41 Also, this might be better. It's an extra shift, but it's one instruction
42 shorter, and doesn't stress 8-bit subreg support.
43 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
44 but without the unnecessary and.)
45 movl %ecx, %eax
46 shrl $5, %eax
47 movl %eax, %edx
48 xorl $1, %edx
49 sall %cl, %eax
sall %cl, %edx
51
52 64-bit shifts (in general) expand to really bad code. Instead of using
53 cmovs, we should expand to a conditional branch like GCC produces.
54
55 //===---------------------------------------------------------------------===//
56
57 Some isel ideas:
58
59 1. Dynamic programming based approach when compile time is not an
60 issue.
61 2. Code duplication (addressing mode) during isel.
62 3. Other ideas from "Register-Sensitive Selection, Duplication, and
63 Sequencing of Instructions".
64 4. Scheduling for reduced register pressure. E.g. "Minimum Register
65 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
66 and other related papers.
67 http://citeseer.ist.psu.edu/govindarajan01minimum.html
68
69 //===---------------------------------------------------------------------===//
70
71 Should we promote i16 to i32 to avoid partial register update stalls?
72
73 //===---------------------------------------------------------------------===//
74
75 Leave any_extend as pseudo instruction and hint to register
76 allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
78 the coalescer how to deal with it though.
79
80 //===---------------------------------------------------------------------===//
81
It appears icc uses push for parameter passing. Need to investigate.
83
84 //===---------------------------------------------------------------------===//
85
86 This:
87
88 void foo(void);
89 void bar(int x, int *P) {
90 x >>= 2;
91 if (x)
92 foo();
93 *P = x;
94 }
95
96 compiles into:
97
98 movq %rsi, %rbx
99 movl %edi, %r14d
100 sarl $2, %r14d
101 testl %r14d, %r14d
102 je LBB0_2
103
104 Instead of doing an explicit test, we can use the flags off the sar. This
105 occurs in a bigger testcase like this, which is pretty common:
106
107 #include <vector>
108 int test1(std::vector<int> &X) {
109 int Sum = 0;
110 for (long i = 0, e = X.size(); i != e; ++i)
111 X[i] = 0;
112 return Sum;
113 }
114
115 //===---------------------------------------------------------------------===//
116
117 Only use inc/neg/not instructions on processors where they are faster than
118 add/sub/xor. They are slower on the P4 due to only updating some processor
119 flags.
120
121 //===---------------------------------------------------------------------===//
122
123 The instruction selector sometimes misses folding a load into a compare. The
124 pattern is written as (cmp reg, (load p)). Because the compare isn't
125 commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
127 when it can invert the result of the compare for free.
128
129 //===---------------------------------------------------------------------===//
130
131 In many cases, LLVM generates code like this:
132
133 _test:
134 movl 8(%esp), %eax
135 cmpl %eax, 4(%esp)
136 setl %al
137 movzbl %al, %eax
138 ret
139
On some processors (which ones?), it is more efficient to do this:
141
142 _test:
143 movl 8(%esp), %ebx
144 xor %eax, %eax
145 cmpl %ebx, 4(%esp)
146 setl %al
147 ret
148
149 Doing this correctly is tricky though, as the xor clobbers the flags.
150
151 //===---------------------------------------------------------------------===//
152
153 We should generate bts/btr/etc instructions on targets where they are cheap or
154 when codesize is important. e.g., for:
155
156 void setbit(int *target, int bit) {
157 *target |= (1 << bit);
158 }
159 void clearbit(int *target, int bit) {
160 *target &= ~(1 << bit);
161 }
162
163 //===---------------------------------------------------------------------===//
164
165 Instead of the following for memset char*, 1, 10:
166
167 movl $16843009, 4(%edx)
168 movl $16843009, (%edx)
169 movw $257, 8(%edx)
170
171 It might be better to generate
172
173 movl $16843009, %eax
174 movl %eax, 4(%edx)
175 movl %eax, (%edx)
movw %ax, 8(%edx)
177
178 when we can spare a register. It reduces code size.
179
180 //===---------------------------------------------------------------------===//
181
182 Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
183 get this:
184
185 define i32 @test1(i32 %X) {
186 %Y = sdiv i32 %X, 8
187 ret i32 %Y
188 }
189
190 _test1:
191 movl 4(%esp), %eax
192 movl %eax, %ecx
193 sarl $31, %ecx
194 shrl $29, %ecx
195 addl %ecx, %eax
196 sarl $3, %eax
197 ret
198
199 GCC knows several different ways to codegen it, one of which is this:
200
201 _test1:
202 movl 4(%esp), %eax
203 cmpl $-1, %eax
204 leal 7(%eax), %ecx
205 cmovle %ecx, %eax
206 sarl $3, %eax
207 ret
208
209 which is probably slower, but it's interesting at least :)
210
211 //===---------------------------------------------------------------------===//
212
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand-tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)
217
218 //===---------------------------------------------------------------------===//
219
220 Optimize this into something reasonable:
221 x * copysign(1.0, y) * copysign(1.0, z)
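
One plausible target shape, in C: the two copysign factors can only flip x's
sign, so xor the sign bits of y and z into x (a sketch; mul_signs is a made-up
name, and NaN-sign/exception subtleties are ignored):

#include <stdint.h>
#include <string.h>

static double mul_signs(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;   /* flip x's sign if signs differ */
  memcpy(&x, &xb, sizeof xb);
  return x;
}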
222
223 //===---------------------------------------------------------------------===//
224
225 Optimize copysign(x, *y) to use an integer load from y.
226
227 //===---------------------------------------------------------------------===//
228
229 The following tests perform worse with LSR:
230
231 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
232
233 //===---------------------------------------------------------------------===//
234
235 Adding to the list of cmp / test poor codegen issues:
236
237 int test(__m128 *A, __m128 *B) {
238 if (_mm_comige_ss(*A, *B))
239 return 3;
240 else
241 return 4;
242 }
243
244 _test:
245 movl 8(%esp), %eax
246 movaps (%eax), %xmm0
247 movl 4(%esp), %eax
248 movaps (%eax), %xmm1
249 comiss %xmm0, %xmm1
250 setae %al
251 movzbl %al, %ecx
252 movl $3, %eax
253 movl $4, %edx
254 cmpl $0, %ecx
255 cmove %edx, %eax
256 ret
257
258 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
259 are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any_extend (which becomes a zero extend) is added.
262
263 We probably need some kind of target DAG combine hook to fix this.
264
265 //===---------------------------------------------------------------------===//
266
267 We generate significantly worse code for this than GCC:
268 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
269 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
270
271 There is also one case we do worse on PPC.
272
273 //===---------------------------------------------------------------------===//
274
275 For this:
276
277 int test(int a)
278 {
279 return a * 3;
280 }
281
We currently emit
283 imull $3, 4(%esp), %eax
284
Perhaps this is what we really should generate? Is imull three or four
286 cycles? Note: ICC generates this:
287 movl 4(%esp), %eax
288 leal (%eax,%eax,2), %eax
289
290 The current instruction priority is based on pattern complexity. The former is
291 more "complex" because it folds a load so the latter will not be emitted.
292
293 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
294 should always try to match LEA first since the LEA matching code does some
295 estimate to determine whether the match is profitable.
296
297 However, if we care more about code size, then imull is better. It's two bytes
298 shorter than movl + leal.
299
300 On a Pentium M, both variants have the same characteristics with regard
301 to throughput; however, the multiplication has a latency of four cycles, as
302 opposed to two cycles for the movl+lea variant.
303
304 //===---------------------------------------------------------------------===//
305
306 __builtin_ffs codegen is messy.
307
308 int ffs_(unsigned X) { return __builtin_ffs(X); }
309
310 llvm produces:
311 ffs_:
312 movl 4(%esp), %ecx
313 bsfl %ecx, %eax
314 movl $32, %edx
315 cmove %edx, %eax
316 incl %eax
317 xorl %edx, %edx
318 testl %ecx, %ecx
319 cmove %edx, %eax
320 ret
321
322 vs gcc:
323
324 _ffs_:
325 movl $-1, %edx
326 bsfl 4(%esp), %eax
327 cmove %edx, %eax
328 addl $1, %eax
329 ret
330
331 Another example of __builtin_ffs (use predsimplify to eliminate a select):
332
333 int foo (unsigned long j) {
334 if (j)
335 return __builtin_ffs (j) - 1;
336 else
337 return 0;
338 }
339
340 //===---------------------------------------------------------------------===//
341
It appears gcc places string data with linkonce linkage in
343 .section __TEXT,__const_coal,coalesced instead of
344 .section __DATA,__const_coal,coalesced.
345 Take a look at darwin.h, there are other Darwin assembler directives that we
346 do not make use of.
347
348 //===---------------------------------------------------------------------===//
349
350 define i32 @foo(i32* %a, i32 %t) {
351 entry:
352 br label %cond_true
353
354 cond_true: ; preds = %cond_true, %entry
355 %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
356 %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
357 %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
358 %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
359 %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
360 %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
361 %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
362 %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
363 br i1 %tmp, label %bb12, label %cond_true
364
365 bb12: ; preds = %cond_true
366 ret i32 %tmp7
367 }
368 is pessimized by -loop-reduce and -indvars
369
370 //===---------------------------------------------------------------------===//
371
372 u32 to float conversion improvement:
373
374 float uint32_2_float( unsigned u ) {
375 float fl = (int) (u & 0xffff);
376 float fh = (int) (u >> 16);
377 fh *= 0x1.0p16f;
378 return fh + fl;
379 }
380
381 00000000 subl $0x04,%esp
382 00000003 movl 0x08(%esp,1),%eax
383 00000007 movl %eax,%ecx
384 00000009 shrl $0x10,%ecx
385 0000000c cvtsi2ss %ecx,%xmm0
386 00000010 andl $0x0000ffff,%eax
387 00000015 cvtsi2ss %eax,%xmm1
388 00000019 mulss 0x00000078,%xmm0
389 00000021 addss %xmm1,%xmm0
390 00000025 movss %xmm0,(%esp,1)
391 0000002a flds (%esp,1)
392 0000002d addl $0x04,%esp
393 00000030 ret
394
395 //===---------------------------------------------------------------------===//
396
397 When using fastcc abi, align stack slot of argument of type double on 8 byte
398 boundary to improve performance.
399
400 //===---------------------------------------------------------------------===//
401
402 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
403 simplifications for integer "x cmp y ? a : b".
404
405 //===---------------------------------------------------------------------===//
406
407 Consider the expansion of:
408
409 define i32 @test3(i32 %X) {
410 %tmp1 = urem i32 %X, 255
411 ret i32 %tmp1
412 }
413
414 Currently it compiles to:
415
416 ...
417 movl $2155905153, %ecx
418 movl 8(%esp), %esi
419 movl %esi, %eax
420 mull %ecx
421 ...
422
423 This could be "reassociated" into:
424
425 movl $2155905153, %eax
426 movl 8(%esp), %ecx
427 mull %ecx
428
429 to avoid the copy. In fact, the existing two-address stuff would do this
430 except that mul isn't a commutative 2-addr instruction. I guess this has
431 to be done at isel time based on the #uses to mul?
432
433 //===---------------------------------------------------------------------===//
434
435 Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
437 That is somewhat complicated, but doable. Example 256.bzip2:
438
439 In the new trace, the hot loop has an instruction which crosses a cacheline
440 boundary. In addition to potential cache misses, this can't help decoding as I
441 imagine there has to be some kind of complicated decoder reset and realignment
442 to grab the bytes from the next cacheline.
443
444 532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
445 942 942 0x3d03 movl %dh, (1809(%esp, %esi)
446 937 937 0x3d0a incl %esi
447 3 3 0x3d0b cmpb %bl, %dl
448 27 27 0x3d0d jnz 0x000062db <main+11707>
449
450 //===---------------------------------------------------------------------===//
451
In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
453
454 //===---------------------------------------------------------------------===//
455
456 This could be a single 16-bit load.
457
458 int f(char *p) {
459 if ((p[0] == 1) & (p[1] == 2)) return 1;
460 return 0;
461 }
462
463 //===---------------------------------------------------------------------===//
464
465 We should inline lrintf and probably other libc functions.
466
467 //===---------------------------------------------------------------------===//
468
469 Use the FLAGS values from arithmetic instructions more. For example, compile:
470
471 int add_zf(int *x, int y, int a, int b) {
472 if ((*x += y) == 0)
473 return a;
474 else
475 return b;
476 }
477
478 to:
479 addl %esi, (%rdi)
480 movl %edx, %eax
481 cmovne %ecx, %eax
482 ret
483 instead of:
484
485 _add_zf:
486 addl (%rdi), %esi
487 movl %esi, (%rdi)
488 testl %esi, %esi
489 cmove %edx, %ecx
490 movl %ecx, %eax
491 ret
492
493 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
494 without a test instruction.
495
496 //===---------------------------------------------------------------------===//
497
498 These two functions have identical effects:
499
500 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
501 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
502
503 We currently compile them to:
504
505 _f:
506 movl 4(%esp), %eax
507 movl %eax, %ecx
508 incl %ecx
509 movl 8(%esp), %edx
510 cmpl %edx, %ecx
511 jne LBB1_2 #UnifiedReturnBlock
512 LBB1_1: #cond_true
513 addl $2, %eax
514 ret
515 LBB1_2: #UnifiedReturnBlock
516 movl %ecx, %eax
517 ret
518 _f2:
519 movl 4(%esp), %eax
520 movl %eax, %ecx
521 incl %ecx
522 cmpl 8(%esp), %ecx
523 sete %cl
524 movzbl %cl, %ecx
525 leal 1(%ecx,%eax), %eax
526 ret
527
528 both of which are inferior to GCC's:
529
530 _f:
531 movl 4(%esp), %edx
532 leal 1(%edx), %eax
533 addl $2, %edx
534 cmpl 8(%esp), %eax
535 cmove %edx, %eax
536 ret
537 _f2:
538 movl 4(%esp), %eax
539 addl $1, %eax
540 xorl %edx, %edx
541 cmpl 8(%esp), %eax
542 sete %dl
543 addl %edx, %eax
544 ret
545
546 //===---------------------------------------------------------------------===//
547
548 This code:
549
550 void test(int X) {
551 if (X) abort();
552 }
553
554 is currently compiled to:
555
556 _test:
557 subl $12, %esp
558 cmpl $0, 16(%esp)
559 jne LBB1_1
560 addl $12, %esp
561 ret
562 LBB1_1:
563 call L_abort$stub
564
565 It would be better to produce:
566
567 _test:
568 subl $12, %esp
569 cmpl $0, 16(%esp)
570 jne L_abort$stub
571 addl $12, %esp
572 ret
573
574 This can be applied to any no-return function call that takes no arguments etc.
575 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
576 something like this:
577
578 _test:
579 cmpl $0, 4(%esp)
580 jne LBB1_1
581 ret
582 LBB1_1:
583 subl $12, %esp
584 call L_abort$stub
585
586 Both are useful in different situations. Finally, it could be shrink-wrapped
587 and tail called, like this:
588
589 _test:
590 cmpl $0, 4(%esp)
591 jne LBB1_1
592 ret
593 LBB1_1:
594 pop %eax # realign stack.
595 call L_abort$stub
596
597 Though this probably isn't worth it.
598
599 //===---------------------------------------------------------------------===//
600
601 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
602 a neg instead of a sub instruction. Consider:
603
604 int test(char X) { return 7-X; }
605
606 we currently produce:
607 _test:
608 movl $7, %eax
609 movsbl 4(%esp), %ecx
610 subl %ecx, %eax
611 ret
612
613 We would use one fewer register if codegen'd as:
614
615 movsbl 4(%esp), %eax
616 neg %eax
617 add $7, %eax
618 ret
619
620 Note that this isn't beneficial if the load can be folded into the sub. In
621 this case, we want a sub:
622
623 int test(int X) { return 7-X; }
624 _test:
625 movl $7, %eax
626 subl 4(%esp), %eax
627 ret
628
629 //===---------------------------------------------------------------------===//
630
631 Leaf functions that require one 4-byte spill slot have a prolog like this:
632
633 _foo:
634 pushl %esi
635 subl $4, %esp
636 ...
637 and an epilog like this:
638 addl $4, %esp
639 popl %esi
640 ret
641
642 It would be smaller, and potentially faster, to push eax on entry and to
643 pop into a dummy register instead of using addl/subl of esp. Just don't pop
644 into any return registers :)
645
646 //===---------------------------------------------------------------------===//
647
648 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
649 branches. We generate really poor code for:
650
651 double testf(double a) {
652 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
653 }
654
655 For example, the entry BB is:
656
657 _testf:
658 subl $20, %esp
659 pxor %xmm0, %xmm0
660 movsd 24(%esp), %xmm1
661 ucomisd %xmm0, %xmm1
662 setnp %al
663 sete %cl
664 testb %cl, %al
665 jne LBB1_5 # UnifiedReturnBlock
666 LBB1_1: # cond_true
667
668
It would be better to replace the last four instructions with:
670
671 jp LBB1_1
672 je LBB1_5
673 LBB1_1:
674
675 We also codegen the inner ?: into a diamond:
676
677 cvtss2sd LCPI1_0(%rip), %xmm2
678 cvtss2sd LCPI1_1(%rip), %xmm3
679 ucomisd %xmm1, %xmm0
680 ja LBB1_3 # cond_true
681 LBB1_2: # cond_true
682 movapd %xmm3, %xmm2
683 LBB1_3: # cond_true
684 movapd %xmm2, %xmm0
685 ret
686
687 We should sink the load into xmm3 into the LBB1_2 block. This should
688 be pretty easy, and will nuke all the copies.
689
690 //===---------------------------------------------------------------------===//
691
692 This:
693 #include <algorithm>
694 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
695 { return std::make_pair(a + b, a + b < a); }
696 bool no_overflow(unsigned a, unsigned b)
697 { return !full_add(a, b).second; }
698
699 Should compile to:
700 addl %esi, %edi
701 setae %al
702 movzbl %al, %eax
703 ret
704
705 on x86-64, instead of the rather stupid-looking:
706 addl %esi, %edi
707 setb %al
708 xorb $1, %al
709 movzbl %al, %eax
710 ret
711
712
713 //===---------------------------------------------------------------------===//
714
715 The following code:
716
717 bb114.preheader: ; preds = %cond_next94
718 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
719 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
720 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
721 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
722 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
723 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
724 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
725 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
726 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
727 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
728 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
729 br label %bb114
730
731 produces:
732
733 LBB3_5: # bb114.preheader
734 movswl -68(%ebp), %eax
735 movl $32, %ecx
736 movl %ecx, -80(%ebp)
737 subl %eax, -80(%ebp)
738 movswl -52(%ebp), %eax
739 movl %ecx, -84(%ebp)
740 subl %eax, -84(%ebp)
741 movswl -70(%ebp), %eax
742 movl %ecx, -88(%ebp)
743 subl %eax, -88(%ebp)
744 movswl -50(%ebp), %eax
745 subl %eax, %ecx
746 movl %ecx, -76(%ebp)
747 movswl -42(%ebp), %eax
748 movl %eax, -92(%ebp)
749 movswl -66(%ebp), %eax
750 movl %eax, -96(%ebp)
751 movw $0, -98(%ebp)
752
753 This appears to be bad because the RA is not folding the store to the stack
754 slot into the movl. The above instructions could be:
755 movl $32, -80(%ebp)
756 ...
757 movl $32, -84(%ebp)
758 ...
759 This seems like a cross between remat and spill folding.
760
761 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
762 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
763 vice-versa).
764
765 //===---------------------------------------------------------------------===//
766
767 This code:
768
769 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
770 br i1 %tmp659, label %cond_true662, label %cond_next715
771
772 produces this:
773
774 testw %cx, %cx
775 movswl %cx, %esi
776 jns LBB4_109 # cond_next715
777
778 Shark tells us that using %cx in the testw instruction is sub-optimal. It
779 suggests using the 32-bit register (which is what ICC uses).
780
781 //===---------------------------------------------------------------------===//
782
783 We compile this:
784
785 void compare (long long foo) {
786 if (foo < 4294967297LL)
787 abort();
788 }
789
790 to:
791
792 compare:
793 subl $4, %esp
794 cmpl $0, 8(%esp)
795 setne %al
796 movzbw %al, %ax
797 cmpl $1, 12(%esp)
798 setg %cl
799 movzbw %cl, %cx
800 cmove %ax, %cx
801 testb $1, %cl
802 jne .LBB1_2 # UnifiedReturnBlock
803 .LBB1_1: # ifthen
804 call abort
805 .LBB1_2: # UnifiedReturnBlock
806 addl $4, %esp
807 ret
808
809 (also really horrible code on ppc). This is due to the expand code for 64-bit
810 compares. GCC produces multiple branches, which is much nicer:
811
812 compare:
813 subl $12, %esp
814 movl 20(%esp), %edx
815 movl 16(%esp), %eax
816 decl %edx
817 jle .L7
818 .L5:
819 addl $12, %esp
820 ret
821 .p2align 4,,7
822 .L7:
823 jl .L4
824 cmpl $0, %eax
825 .p2align 4,,8
826 ja .L5
827 .L4:
828 .p2align 4,,9
829 call abort
830
831 //===---------------------------------------------------------------------===//
832
833 Tail call optimization improvements: Tail call optimization currently
834 pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from the
caller's arguments).
838 This is done to prevent overwriting of parameters (see example
839 below) that might be used later.
840
841 example:
842
843 int callee(int32, int64);
844 int caller(int32 arg1, int32 arg2) {
845 int64 local = arg2 * 2;
846 return callee(arg2, (int64)local);
847 }
848
849 [arg1] [!arg2 no longer valid since we moved local onto it]
850 [arg2] -> [(int64)
851 [RETADDR] local ]
852
853 Moving arg1 onto the stack slot of callee function would overwrite
854 arg2 of the caller.
855
856 Possible optimizations:
857
858
859 - Analyse the actual parameters of the callee to see which would
860 overwrite a caller parameter which is used by the callee and only
861 push them onto the top of the stack.
862
863 int callee (int32 arg1, int32 arg2);
864 int caller (int32 arg1, int32 arg2) {
865 return callee(arg1,arg2);
866 }
867
868 Here we don't need to write any variables to the top of the stack
869 since they don't overwrite each other.
870
871 int callee (int32 arg1, int32 arg2);
872 int caller (int32 arg1, int32 arg2) {
873 return callee(arg2,arg1);
874 }
875
876 Here we need to push the arguments because they overwrite each
877 other.
878
879 //===---------------------------------------------------------------------===//
880
881 main ()
882 {
883 int i = 0;
884 unsigned long int z = 0;
885
886 do {
887 z -= 0x00004000;
888 i++;
889 if (i > 0x00040000)
890 abort ();
891 } while (z > 0);
892 exit (0);
893 }
894
895 gcc compiles this to:
896
897 _main:
898 subl $28, %esp
899 xorl %eax, %eax
900 jmp L2
901 L3:
902 cmpl $262144, %eax
903 je L10
904 L2:
905 addl $1, %eax
906 cmpl $262145, %eax
907 jne L3
908 call L_abort$stub
909 L10:
910 movl $0, (%esp)
911 call L_exit$stub
912
913 llvm:
914
915 _main:
916 subl $12, %esp
917 movl $1, %eax
918 movl $16384, %ecx
919 LBB1_1: # bb
920 cmpl $262145, %eax
921 jge LBB1_4 # cond_true
922 LBB1_2: # cond_next
923 incl %eax
924 addl $4294950912, %ecx
925 cmpl $16384, %ecx
926 jne LBB1_1 # bb
927 LBB1_3: # bb11
928 xorl %eax, %eax
929 addl $12, %esp
930 ret
931 LBB1_4: # cond_true
932 call L_abort$stub
933
934 1. LSR should rewrite the first cmp with induction variable %ecx.
935 2. DAG combiner should fold
936 leal 1(%eax), %edx
937 cmpl $262145, %edx
938 =>
939 cmpl $262144, %eax
940
941 //===---------------------------------------------------------------------===//
942
943 define i64 @test(double %X) {
944 %Y = fptosi double %X to i64
945 ret i64 %Y
946 }
947
948 compiles to:
949
950 _test:
951 subl $20, %esp
952 movsd 24(%esp), %xmm0
953 movsd %xmm0, 8(%esp)
954 fldl 8(%esp)
955 fisttpll (%esp)
956 movl 4(%esp), %edx
957 movl (%esp), %eax
958 addl $20, %esp
959 #FP_REG_KILL
960 ret
961
962 This should just fldl directly from the input stack slot.
963
964 //===---------------------------------------------------------------------===//
965
966 This code:
967 int foo (int x) { return (x & 65535) | 255; }
968
969 Should compile into:
970
971 _foo:
972 movzwl 4(%esp), %eax
973 orl $255, %eax
974 ret
975
976 instead of:
977 _foo:
978 movl $65280, %eax
979 andl 4(%esp), %eax
980 orl $255, %eax
981 ret
982
983 //===---------------------------------------------------------------------===//
984
985 We're codegen'ing multiply of long longs inefficiently:
986
987 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
988 return arg1 * arg2;
989 }
990
991 We compile to (fomit-frame-pointer):
992
993 _LLM:
994 pushl %esi
995 movl 8(%esp), %ecx
996 movl 16(%esp), %esi
997 movl %esi, %eax
998 mull %ecx
999 imull 12(%esp), %esi
1000 addl %edx, %esi
1001 imull 20(%esp), %ecx
1002 movl %esi, %edx
1003 addl %ecx, %edx
1004 popl %esi
1005 ret
1006
1007 This looks like a scheduling deficiency and lack of remat of the load from
1008 the argument area. ICC apparently produces:
1009
1010 movl 8(%esp), %ecx
1011 imull 12(%esp), %ecx
1012 movl 16(%esp), %eax
1013 imull 4(%esp), %eax
1014 addl %eax, %ecx
1015 movl 4(%esp), %eax
1016 mull 12(%esp)
1017 addl %ecx, %edx
1018 ret
1019
1020 Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
1021 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1022
1023 //===---------------------------------------------------------------------===//
1024
1025 We can fold a store into "zeroing a reg". Instead of:
1026
1027 xorl %eax, %eax
1028 movl %eax, 124(%esp)
1029
1030 we should get:
1031
1032 movl $0, 124(%esp)
1033
1034 if the flags of the xor are dead.
1035
1036 Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
1037 be folded into: shl [mem], 1
1038
1039 //===---------------------------------------------------------------------===//
1040
In SSE mode, we turn abs and neg into a load from the constant pool plus an
xorpd or andpd instruction, for example:
1043
1044 xorpd LCPI1_0, %xmm2
1045
1046 However, if xmm2 gets spilled, we end up with really ugly code like this:
1047
1048 movsd (%esp), %xmm0
1049 xorpd LCPI1_0, %xmm0
1050 movsd %xmm0, (%esp)
1051
1052 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1053 the neg/abs instruction, turning it into an *integer* operation, like this:
1054
1055 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
1056
You could also use xorb, but xorl is less likely to lead to a partial register
stall. Here is a contrived testcase:
1059
1060 double a, b, c;
1061 void test(double *P) {
1062 double X = *P;
1063 a = X;
1064 bar();
1065 X = -X;
1066 b = X;
1067 bar();
1068 c = X;
1069 }
1070
1071 //===---------------------------------------------------------------------===//
1072
The code generated on x86 for the obvious way of checking for signed overflow
on a multiply is much longer than it needs to be.
1075
1076 int x(int a, int b) {
1077 long long prod = (long long)a*b;
1078 return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1079 }
1080
1081 See PR2053 for more details.
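
The cheap way to check is to use the condition the hardware already computes:
a 32x32 multiply overflows exactly when the high 32 bits of the product are not
the sign-extension of the low 32 bits, which is what imull reports in OF
(imull + seto/jo). A source-level sketch of that condition (x_check is only an
illustration, and it assumes arithmetic right shift of negative values, which
GCC and Clang both provide):

int x_check(int a, int b) {
  long long prod = (long long)a * b;
  int high = (int)(prod >> 32);                    /* high 32 bits of the product */
  int signword = ((unsigned)prod >> 31) ? -1 : 0;  /* sign-extension word of the low half */
  /* overflowed iff the high word is not the sign-extension of the low word;
     this is exactly the condition imull leaves in OF */
  return high != signword;
}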
1082
1083 //===---------------------------------------------------------------------===//
1084
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter. The downside is that it puts more
pressure on register allocation because it has fixed operands.
1089
1090 Example:
1091 int abs(int x) {return x < 0 ? -x : x;}
1092
1093 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1094 abs:
1095 movl 4(%esp), %eax
1096 cltd
1097 xorl %edx, %eax
1098 subl %edx, %eax
1099 ret
1100
1101 //===---------------------------------------------------------------------===//
1102
1103 Take the following code (from
1104 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1105
1106 extern unsigned char first_one[65536];
1107 int FirstOnet(unsigned long long arg1)
1108 {
1109 if (arg1 >> 48)
1110 return (first_one[arg1 >> 48]);
1111 return 0;
1112 }
1113
1114
1115 The following code is currently generated:
1116 FirstOnet:
1117 movl 8(%esp), %eax
1118 cmpl $65536, %eax
1119 movl 4(%esp), %ecx
1120 jb .LBB1_2 # UnifiedReturnBlock
1121 .LBB1_1: # ifthen
1122 shrl $16, %eax
1123 movzbl first_one(%eax), %eax
1124 ret
1125 .LBB1_2: # UnifiedReturnBlock
1126 xorl %eax, %eax
1127 ret
1128
1129 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1130 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
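
In source form the suggested rewrite corresponds to something like this
(FirstOnet2 is hypothetical, only meant to show why movzwl + testl is enough):

extern unsigned char first_one[65536];
int FirstOnet2(unsigned long long arg1)
{
  unsigned short hi = (unsigned short)(arg1 >> 48);  /* movzwl 10(%esp), %eax */
  if (hi)                                            /* testl %eax, %eax */
    return first_one[hi];                            /* movzbl first_one(%eax), %eax */
  return 0;
}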
1131
1132 //===---------------------------------------------------------------------===//
1133
1134 We compile this function:
1135
1136 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
1137 entry:
1138 %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
1139 br i1 %tmp2, label %bb7, label %bb
1140
1141 bb: ; preds = %entry
1142 %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
1143 ret i32 %tmp6
1144
1145 bb7: ; preds = %entry
1146 %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
1147 ret i32 %tmp10
1148 }
1149
1150 to:
1151
1152 foo: # @foo
1153 # BB#0: # %entry
1154 movl 4(%esp), %ecx
1155 cmpb $0, 16(%esp)
1156 je .LBB0_2
1157 # BB#1: # %bb
1158 movl 8(%esp), %eax
1159 addl %ecx, %eax
1160 ret
1161 .LBB0_2: # %bb7
1162 movl 12(%esp), %edx
1163 movl %ecx, %eax
1164 subl %edx, %eax
1165 ret
1166
1167 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1168 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1169
1170 //===---------------------------------------------------------------------===//
1171
1172 See rdar://4653682.
1173
1174 From flops:
1175
1176 LBB1_15: # bb310
1177 cvtss2sd LCPI1_0, %xmm1
1178 addsd %xmm1, %xmm0
1179 movsd 176(%esp), %xmm2
1180 mulsd %xmm0, %xmm2
1181 movapd %xmm2, %xmm3
1182 mulsd %xmm3, %xmm3
1183 movapd %xmm3, %xmm4
1184 mulsd LCPI1_23, %xmm4
1185 addsd LCPI1_24, %xmm4
1186 mulsd %xmm3, %xmm4
1187 addsd LCPI1_25, %xmm4
1188 mulsd %xmm3, %xmm4
1189 addsd LCPI1_26, %xmm4
1190 mulsd %xmm3, %xmm4
1191 addsd LCPI1_27, %xmm4
1192 mulsd %xmm3, %xmm4
1193 addsd LCPI1_28, %xmm4
1194 mulsd %xmm3, %xmm4
1195 addsd %xmm1, %xmm4
1196 mulsd %xmm2, %xmm4
1197 movsd 152(%esp), %xmm1
1198 addsd %xmm4, %xmm1
1199 movsd %xmm1, 152(%esp)
1200 incl %eax
1201 cmpl %eax, %esi
1202 jge LBB1_15 # bb310
1203 LBB1_16: # bb358.loopexit
1204 movsd 152(%esp), %xmm0
1205 addsd %xmm0, %xmm0
1206 addsd LCPI1_22, %xmm0
1207 movsd %xmm0, 152(%esp)
1208
Rather than spilling the result of the last addsd in the loop, we should
insert a copy to split the live interval (one part for the duration of the loop,
one extending to the fall-through). The register pressure in the loop isn't high
enough to warrant the spill.
1213
1214 Also check why xmm7 is not used at all in the function.
1215
1216 //===---------------------------------------------------------------------===//
1217
1218 Take the following:
1219
1220 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1221 target triple = "i386-apple-darwin8"
1222 @in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
1223 define fastcc void @abort_gzip() noreturn nounwind {
1224 entry:
1225 %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
1226 br i1 %tmp.b.i, label %bb.i, label %bb4.i
1227 bb.i: ; preds = %entry
1228 tail call void @exit( i32 1 ) noreturn nounwind
1229 unreachable
1230 bb4.i: ; preds = %entry
1231 store i1 true, i1* @in_exit.4870.b
1232 tail call void @exit( i32 1 ) noreturn nounwind
1233 unreachable
1234 }
1235 declare void @exit(i32) noreturn nounwind
1236
1237 This compiles into:
1238 _abort_gzip: ## @abort_gzip
1239 ## BB#0: ## %entry
1240 subl $12, %esp
1241 movb _in_exit.4870.b, %al
1242 cmpb $1, %al
1243 jne LBB0_2
1244
1245 We somehow miss folding the movb into the cmpb.
1246
1247 //===---------------------------------------------------------------------===//
1248
1249 We compile:
1250
1251 int test(int x, int y) {
1252 return x-y-1;
1253 }
1254
1255 into (-m64):
1256
1257 _test:
1258 decl %edi
1259 movl %edi, %eax
1260 subl %esi, %eax
1261 ret
1262
1263 it would be better to codegen as: x+~y (notl+addl)
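
At the source level the identity is simply the following (test2 is only an
illustration of the equivalent form); a notl of the incoming %esi plus a single
addl/leal would then cover the whole function body:

int test2(int x, int y) {
  return x + ~y;   /* ~y == -y - 1, so x + ~y == x - y - 1 */
}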
1264
1265 //===---------------------------------------------------------------------===//
1266
1267 This code:
1268
1269 int foo(const char *str,...)
1270 {
1271 __builtin_va_list a; int x;
1272 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1273 return x;
1274 }
1275
1276 gets compiled into this on x86-64:
1277 subq $200, %rsp
1278 movaps %xmm7, 160(%rsp)
1279 movaps %xmm6, 144(%rsp)
1280 movaps %xmm5, 128(%rsp)
1281 movaps %xmm4, 112(%rsp)
1282 movaps %xmm3, 96(%rsp)
1283 movaps %xmm2, 80(%rsp)
1284 movaps %xmm1, 64(%rsp)
1285 movaps %xmm0, 48(%rsp)
1286 movq %r9, 40(%rsp)
1287 movq %r8, 32(%rsp)
1288 movq %rcx, 24(%rsp)
1289 movq %rdx, 16(%rsp)
1290 movq %rsi, 8(%rsp)
1291 leaq (%rsp), %rax
1292 movq %rax, 192(%rsp)
1293 leaq 208(%rsp), %rax
1294 movq %rax, 184(%rsp)
1295 movl $48, 180(%rsp)
1296 movl $8, 176(%rsp)
1297 movl 176(%rsp), %eax
1298 cmpl $47, %eax
1299 jbe .LBB1_3 # bb
1300 .LBB1_1: # bb3
1301 movq 184(%rsp), %rcx
1302 leaq 8(%rcx), %rax
1303 movq %rax, 184(%rsp)
1304 .LBB1_2: # bb4
1305 movl (%rcx), %eax
1306 addq $200, %rsp
1307 ret
1308 .LBB1_3: # bb
1309 movl %eax, %ecx
1310 addl $8, %eax
1311 addq 192(%rsp), %rcx
1312 movl %eax, 176(%rsp)
1313 jmp .LBB1_2 # bb4
1314
1315 gcc 4.3 generates:
1316 subq $96, %rsp
1317 .LCFI0:
1318 leaq 104(%rsp), %rax
1319 movq %rsi, -80(%rsp)
1320 movl $8, -120(%rsp)
1321 movq %rax, -112(%rsp)
1322 leaq -88(%rsp), %rax
1323 movq %rax, -104(%rsp)
1324 movl $8, %eax
1325 cmpl $48, %eax
1326 jb .L6
1327 movq -112(%rsp), %rdx
1328 movl (%rdx), %eax
1329 addq $96, %rsp
1330 ret
1331 .p2align 4,,10
1332 .p2align 3
1333 .L6:
1334 mov %eax, %edx
1335 addq -104(%rsp), %rdx
1336 addl $8, %eax
1337 movl %eax, -120(%rsp)
1338 movl (%rdx), %eax
1339 addq $96, %rsp
1340 ret
1341
1342 and it gets compiled into this on x86:
1343 pushl %ebp
1344 movl %esp, %ebp
1345 subl $4, %esp
1346 leal 12(%ebp), %eax
1347 movl %eax, -4(%ebp)
1348 leal 16(%ebp), %eax
1349 movl %eax, -4(%ebp)
1350 movl 12(%ebp), %eax
1351 addl $4, %esp
1352 popl %ebp
1353 ret
1354
1355 gcc 4.3 generates:
1356 pushl %ebp
1357 movl %esp, %ebp
1358 movl 12(%ebp), %eax
1359 popl %ebp
1360 ret
1361
1362 //===---------------------------------------------------------------------===//
1363
1364 Teach tblgen not to check bitconvert source type in some cases. This allows us
1365 to consolidate the following patterns in X86InstrMMX.td:
1366
1367 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1368 (iPTR 0))))),
1369 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1370 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1371 (iPTR 0))))),
1372 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1373 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1374 (iPTR 0))))),
1375 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1376
1377 There are other cases in various td files.
1378
1379 //===---------------------------------------------------------------------===//
1380
1381 Take something like the following on x86-32:
1382 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1383
1384 We currently generate a libcall, but we really shouldn't: the expansion is
1385 shorter and likely faster than the libcall. The expected code is something
1386 like the following:
1387
1388 movl 12(%ebp), %eax
1389 movl 16(%ebp), %ecx
1390 xorl %edx, %edx
1391 divl %ecx
1392 movl 8(%ebp), %eax
1393 divl %ecx
1394 movl %edx, %eax
1395 ret
1396
1397 A similar code sequence works for division.
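
For reference, here is the arithmetic the two divl instructions implement,
written out in C (urem64_32 is only an illustration of the expansion; each '%'
below corresponds to one 32-bit divl, and y is assumed to be non-zero):

unsigned urem64_32(unsigned long long x, unsigned y) {
  unsigned hi = (unsigned)(x >> 32);
  unsigned lo = (unsigned)x;
  unsigned r1 = hi % y;    /* first divl: remainder of the high half */
  /* the second divl divides the 64-bit value (r1:lo) by y; since r1 < y the
     quotient fits in 32 bits, so the instruction cannot fault */
  return (unsigned)((((unsigned long long)r1 << 32) | lo) % y);
}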
1398
1399 //===---------------------------------------------------------------------===//
1400
These should compile to the same code, but the latter codegens to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):
1403
1404 struct s1 { unsigned char a, b; };
1405 unsigned long f1(struct s1 x) {
1406 return x.a + x.b;
1407 }
1408 struct s2 { unsigned a: 8, b: 8; };
1409 unsigned long f2(struct s2 x) {
1410 return x.a + x.b;
1411 }
1412
1413 //===---------------------------------------------------------------------===//
1414
1415 We currently compile this:
1416
1417 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1418 entry:
1419 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1420 %sum = extractvalue {i32, i1} %t, 0
1421 %obit = extractvalue {i32, i1} %t, 1
1422 br i1 %obit, label %overflow, label %normal
1423 normal:
1424 ret i32 %sum
1425 overflow:
1426 call void @llvm.trap()
1427 unreachable
1428 }
1429 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1430 declare void @llvm.trap()
1431
1432 to:
1433
1434 _func1:
1435 movl 4(%esp), %eax
1436 addl 8(%esp), %eax
1437 jo LBB1_2 ## overflow
1438 LBB1_1: ## normal
1439 ret
1440 LBB1_2: ## overflow
1441 ud2
1442
1443 it would be nice to produce "into" someday.
1444
1445 //===---------------------------------------------------------------------===//
1446
1447 This code:
1448
1449 void vec_mpys1(int y[], const int x[], int scaler) {
1450 int i;
1451 for (i = 0; i < 150; i++)
1452 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1453 }
1454
1455 Compiles to this loop with GCC 3.x:
1456
1457 .L5:
1458 movl %ebx, %eax
1459 imull (%edi,%ecx,4)
1460 shrdl $31, %edx, %eax
1461 addl %eax, (%esi,%ecx,4)
1462 incl %ecx
1463 cmpl $149, %ecx
1464 jle .L5
1465
1466 llvm-gcc compiles it to the much uglier:
1467
1468 LBB1_1: ## bb1
1469 movl 24(%esp), %eax
1470 movl (%eax,%edi,4), %ebx
1471 movl %ebx, %ebp
1472 imull %esi, %ebp
1473 movl %ebx, %eax
1474 mull %ecx
1475 addl %ebp, %edx
1476 sarl $31, %ebx
1477 imull %ecx, %ebx
1478 addl %edx, %ebx
1479 shldl $1, %eax, %ebx
1480 movl 20(%esp), %eax
1481 addl %ebx, (%eax,%edi,4)
1482 incl %edi
1483 cmpl $150, %edi
1484 jne LBB1_1 ## bb1
1485
The issue is that we hoist the cast of "scaler" to long long outside of the
loop, so the value comes into the loop as two separate values, and
RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
constructed BUILD_PAIR which represents the cast value.
1490
1491 This can be handled by making CodeGenPrepare sink the cast.
1492
1493 //===---------------------------------------------------------------------===//
1494
1495 Test instructions can be eliminated by using EFLAGS values from arithmetic
1496 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1497 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
1499 OF or CF flags are needed.
1500
The shift operators have the complication that when the shift count is
zero, EFLAGS is not modified, so they can only subsume a test instruction if
1503 the shift count is known to be non-zero. Also, using the EFLAGS value
1504 from a shift is apparently very slow on some x86 implementations.
1505
1506 In read-modify-write instructions, the root node in the isel match is
1507 the store, and isel has no way for the use of the EFLAGS result of the
1508 arithmetic to be remapped to the new node.
1509
Add and subtract instructions set OF on signed overflow and CF on unsigned
1511 overflow, while test instructions always clear OF and CF. In order to
1512 replace a test with an add or subtract in a situation where OF or CF is
1513 needed, codegen must be able to prove that the operation cannot see
1514 signed or unsigned overflow, respectively.
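
As a concrete example of one of the cases above (g is only an illustration, not
a reduced test from the backend): the ZF produced by the andl itself already
answers the s == 0 question, so no separate testl is needed, and since andl
clears OF and CF the resulting flags even match what a testl would produce.

int g(int a, int b) {
  int s = a & b;   /* andl sets ZF from the result and clears OF/CF */
  if (s == 0)      /* a following testl of s would be redundant */
    return 1;
  return s;
}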
1515
1516 //===---------------------------------------------------------------------===//
1517
1518 memcpy/memmove do not lower to SSE copies when possible. A silly example is:
1519 define <16 x float> @foo(<16 x float> %A) nounwind {
1520 %tmp = alloca <16 x float>, align 16
1521 %tmp2 = alloca <16 x float>, align 16
1522 store <16 x float> %A, <16 x float>* %tmp
1523 %s = bitcast <16 x float>* %tmp to i8*
1524 %s2 = bitcast <16 x float>* %tmp2 to i8*
1525 call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1526 %R = load <16 x float>* %tmp2
1527 ret <16 x float> %R
1528 }
1529
1530 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1531
1532 which compiles to:
1533
1534 _foo:
1535 subl $140, %esp
1536 movaps %xmm3, 112(%esp)
1537 movaps %xmm2, 96(%esp)
1538 movaps %xmm1, 80(%esp)
1539 movaps %xmm0, 64(%esp)
1540 movl 60(%esp), %eax
1541 movl %eax, 124(%esp)
1542 movl 56(%esp), %eax
1543 movl %eax, 120(%esp)
1544 movl 52(%esp), %eax
1545 <many many more 32-bit copies>
1546 movaps (%esp), %xmm0
1547 movaps 16(%esp), %xmm1
1548 movaps 32(%esp), %xmm2
1549 movaps 48(%esp), %xmm3
1550 addl $140, %esp
1551 ret
1552
1553 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1554 fall back to lower-granularity chunks.
1555
1556 //===---------------------------------------------------------------------===//
1557
1558 Implement processor-specific optimizations for parity with GCC on these
1559 processors. GCC does two optimizations:
1560
1. ix86_pad_returns inserts a noop before ret instructions if the ret is
immediately preceded by a conditional branch or is the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
code contains more than 3 branches.

The first one is done for all AMDs, Core 2, and "Generic".
The second one is done for Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".
1569
1570 //===---------------------------------------------------------------------===//
1571
1572 Testcase:
1573 int a(int x) { return (x & 127) > 31; }
1574
1575 Current output:
1576 movl 4(%esp), %eax
1577 andl $127, %eax
1578 cmpl $31, %eax
1579 seta %al
1580 movzbl %al, %eax
1581 ret
1582
1583 Ideal output:
1584 xorl %eax, %eax
1585 testl $96, 4(%esp)
1586 setne %al
1587 ret
1588
1589 This should definitely be done in instcombine, canonicalizing the range
1590 condition into a != condition. We get this IR:
1591
1592 define i32 @a(i32 %x) nounwind readnone {
1593 entry:
1594 %0 = and i32 %x, 127 ; <i32> [#uses=1]
1595 %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
1596 %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
1597 ret i32 %2
1598 }
1599
Instcombine prefers to strength-reduce relational comparisons to equality
comparisons when possible; this should be another case of that. It could
be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
looks like that function should really be redesigned to use ComputeMaskedBits
and friends.
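
The canonical form instcombine should produce corresponds to this source
(a_canon is just an illustration of the transformed comparison):

int a_canon(int x) { return (x & 96) != 0; }  /* (x & 127) > 31  <=>  bit 5 or bit 6 set */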
1605
1606
1607 //===---------------------------------------------------------------------===//
1608 Testcase:
1609 int x(int a) { return (a&0xf0)>>4; }
1610
1611 Current output:
1612 movl 4(%esp), %eax
1613 shrl $4, %eax
1614 andl $15, %eax
1615 ret
1616
1617 Ideal output:
1618 movzbl 4(%esp), %eax
1619 shrl $4, %eax
1620 ret
1621
1622 //===---------------------------------------------------------------------===//
1623
1624 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1625 properly.
1626
When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to compute the returned value at all. Instead, it
can use the add, sub, inc, and dec instructions with the "lock" prefix.
1630
This is currently implemented using a bit of an instruction selection trick. The
issue is that the target-independent pattern produces one output and a chain, and
we want to map it into one that just outputs a chain. The current trick is to
select it into a MERGE_VALUES with the first definition being an implicit_def.
The proper solution is to add new ISD opcodes for the no-output variant. The DAG
combiner can then transform the node before it gets to target node selection.
1637
Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a way
to add target-specific information to target nodes and have this information
carried over to machine instructions. The asm printer (or JIT) can then use this
information to add the "lock" prefix.
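
A minimal example of the no-result case (atomic_inc is just an illustration;
the comment shows the single instruction we would like to end up with):

void atomic_inc(int *p) {
  __sync_add_and_fetch(p, 1);  /* result unused: ideally just "lock incl (%rdi)"
                                  or "lock addl $1, (%rdi)" */
}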
1643
1644 //===---------------------------------------------------------------------===//
1645
1646 struct B {
1647 unsigned char y0 : 1;
1648 };
1649
1650 int bar(struct B* a) { return a->y0; }
1651
1652 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1653 %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1654 %2 = load i8* %1, align 1
1655 %3 = and i8 %2, 1
1656 %4 = zext i8 %3 to i32
1657 ret i32 %4
1658 }
1659
1660 bar: # @bar
1661 # BB#0:
1662 movb (%rdi), %al
1663 andb $1, %al
1664 movzbl %al, %eax
1665 ret
1666
1667 Missed optimization: should be movl+andl.
1668
1669 //===---------------------------------------------------------------------===//
1670
The x86-64 ABI says:
1672
1673 Booleans, when stored in a memory object, are stored as single byte objects the
1674 value of which is always 0 (false) or 1 (true).
1675
1676 We are not using this fact:
1677
1678 int bar(_Bool *a) { return *a; }
1679
1680 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1681 %1 = load i8* %a, align 1, !tbaa !0
1682 %tmp = and i8 %1, 1
1683 %2 = zext i8 %tmp to i32
1684 ret i32 %2
1685 }
1686
1687 bar:
1688 movb (%rdi), %al
1689 andb $1, %al
1690 movzbl %al, %eax
1691 ret
1692
1693 GCC produces
1694
1695 bar:
1696 movzbl (%rdi), %eax
1697 ret
1698
1699 //===---------------------------------------------------------------------===//
1700
1701 Consider the following two functions compiled with clang:
1702 _Bool foo(int *x) { return !(*x & 4); }
1703 unsigned bar(int *x) { return !(*x & 4); }
1704
1705 foo:
1706 movl 4(%esp), %eax
1707 testb $4, (%eax)
1708 sete %al
1709 movzbl %al, %eax
1710 ret
1711
1712 bar:
1713 movl 4(%esp), %eax
1714 movl (%eax), %eax
1715 shrl $2, %eax
1716 andl $1, %eax
1717 xorl $1, %eax
1718 ret
1719
The second function generates more code even though the two functions are
functionally identical.
1722
1723 //===---------------------------------------------------------------------===//
1724
1725 Take the following C code:
1726 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1727
1728 We generate the following IR with clang:
1729 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1730 entry:
1731 %tmp = xor i32 %b, %a ; <i32> [#uses=1]
1732 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
1733 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
1734 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1735 ret i32 %conv5
1736 }
1737
1738 And the following x86 code:
1739 xorl %esi, %edi
1740 testb $-1, %dil
1741 sete %al
1742 movzbl %al, %eax
1743 ret
1744
1745 A cmpb instead of the xorl+testb would be one instruction shorter.
1746
1747 //===---------------------------------------------------------------------===//
1748
1749 Given the following C code:
1750 int f(int a, int b) { return (signed char)a == (signed char)b; }
1751
1752 We generate the following IR with clang:
1753 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1754 entry:
1755 %sext = shl i32 %a, 24 ; <i32> [#uses=1]
1756 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
1757 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
1758 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
1759 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
1760 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1761 ret i32 %conv5
1762 }
1763
1764 And the following x86 code:
1765 movsbl %sil, %eax
1766 movsbl %dil, %ecx
1767 cmpl %eax, %ecx
1768 sete %al
1769 movzbl %al, %eax
1770 ret
1771
1772
1773 It should be possible to eliminate the sign extensions.
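
One way to see why: sign extension is injective on the byte, so the result only
depends on the low 8 bits and is equivalent to the unsigned-char comparison from
the previous entry (f_canon is just an illustration):

int f_canon(int a, int b) {
  return (unsigned char)a == (unsigned char)b;  /* same result as the signed char compare */
}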
1774
1775 //===---------------------------------------------------------------------===//
1776
1777 LLVM misses a load+store narrowing opportunity in this code:
1778
1779 %struct.bf = type { i64, i16, i16, i32 }
1780
1781 @bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
1782
1783 define void @t1() nounwind ssp {
1784 entry:
1785 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1786 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1787 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
1788 %3 = load i32* %2, align 1 ; <i32> [#uses=1]
1789 %4 = and i32 %3, -65537 ; <i32> [#uses=1]
1790 store i32 %4, i32* %2, align 1
1791 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1792 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1793 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
1794 %8 = load i32* %7, align 1 ; <i32> [#uses=1]
1795 %9 = and i32 %8, -131073 ; <i32> [#uses=1]
1796 store i32 %9, i32* %7, align 1
1797 ret void
1798 }
1799
1800 LLVM currently emits this:
1801
1802 movq bfi(%rip), %rax
1803 andl $-65537, 8(%rax)
1804 movq bfi(%rip), %rax
1805 andl $-131073, 8(%rax)
1806 ret
1807
1808 It could narrow the loads and stores to emit this:
1809
1810 movq bfi(%rip), %rax
1811 andb $-2, 10(%rax)
1812 movq bfi(%rip), %rax
1813 andb $-3, 10(%rax)
1814 ret
1815
1816 The trouble is that there is a TokenFactor between the store and the
1817 load, making it non-trivial to determine if there's anything between
1818 the load and the store which would prohibit narrowing.
1819
1820 //===---------------------------------------------------------------------===//
1821
1822 This code:
1823 void foo(unsigned x) {
1824 if (x == 0) bar();
1825 else if (x == 1) qux();
1826 }
1827
1828 currently compiles into:
1829 _foo:
1830 movl 4(%esp), %eax
1831 cmpl $1, %eax
1832 je LBB0_3
1833 testl %eax, %eax
1834 jne LBB0_4
1835
1836 the testl could be removed:
1837 _foo:
1838 movl 4(%esp), %eax
1839 cmpl $1, %eax
1840 je LBB0_3
1841 jb LBB0_4
1842
1843 0 is the only unsigned number < 1.
1844
1845 //===---------------------------------------------------------------------===//
1846
1847 This code:
1848
1849 %0 = type { i32, i1 }
1850
1851 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1852 entry:
1853 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1854 %cmp = extractvalue %0 %uadd, 1
1855 %inc = zext i1 %cmp to i32
1856 %add = add i32 %x, %sum
1857 %z.0 = add i32 %add, %inc
1858 ret i32 %z.0
1859 }
1860
1861 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1862
1863 compiles to:
1864
1865 _add32carry: ## @add32carry
1866 addl %esi, %edi
1867 sbbl %ecx, %ecx
1868 movl %edi, %eax
1869 subl %ecx, %eax
1870 ret
1871
1872 But it could be:
1873
1874 _add32carry:
1875 leal (%rsi,%rdi), %eax
1876 cmpl %esi, %eax
1877 adcl $0, %eax
1878 ret
1879
1880 //===---------------------------------------------------------------------===//
1881
1882 The hot loop of 256.bzip2 contains code that looks a bit like this:
1883
1884 int foo(char *P, char *Q, int x, int y) {
1885 if (P[0] != Q[0])
1886 return P[0] < Q[0];
1887 if (P[1] != Q[1])
1888 return P[1] < Q[1];
1889 if (P[2] != Q[2])
1890 return P[2] < Q[2];
1891 return P[3] < Q[3];
1892 }
1893
1894 In the real code, we get a lot more wrong than this. However, even in this
1895 code we generate:
1896
1897 _foo: ## @foo
1898 ## BB#0: ## %entry
1899 movb (%rsi), %al
1900 movb (%rdi), %cl
1901 cmpb %al, %cl
1902 je LBB0_2
1903 LBB0_1: ## %if.then
1904 cmpb %al, %cl
1905 jmp LBB0_5
1906 LBB0_2: ## %if.end
1907 movb 1(%rsi), %al
1908 movb 1(%rdi), %cl
1909 cmpb %al, %cl
1910 jne LBB0_1
1911 ## BB#3: ## %if.end38
1912 movb 2(%rsi), %al
1913 movb 2(%rdi), %cl
1914 cmpb %al, %cl
1915 jne LBB0_1
1916 ## BB#4: ## %if.end60
1917 movb 3(%rdi), %al
1918 cmpb 3(%rsi), %al
1919 LBB0_5: ## %if.end60
1920 setl %al
1921 movzbl %al, %eax
1922 ret
1923
1924 Note that we generate jumps to LBB0_1 which does a redundant compare. The
1925 redundant compare also forces the register values to be live, which prevents
1926 folding one of the loads into the compare. In contrast, GCC 4.2 produces:
1927
1928 _foo:
1929 movzbl (%rsi), %eax
1930 cmpb %al, (%rdi)
1931 jne L10
1932 L12:
1933 movzbl 1(%rsi), %eax
1934 cmpb %al, 1(%rdi)
1935 jne L10
1936 movzbl 2(%rsi), %eax
1937 cmpb %al, 2(%rdi)
1938 jne L10
1939 movzbl 3(%rdi), %eax
1940 cmpb 3(%rsi), %al
1941 L10:
1942 setl %al
1943 movzbl %al, %eax
1944 ret
1945
1946 which is "perfect".
1947
1948 //===---------------------------------------------------------------------===//
1949
1950 For the branch in the following code:
1951 int a();
1952 int b(int x, int y) {
1953 if (x & (1<<(y&7)))
1954 return a();
1955 return y;
1956 }
1957
1958 We currently generate:
1959 movb %sil, %al
1960 andb $7, %al
1961 movzbl %al, %eax
1962 btl %eax, %edi
1963 jae .LBB0_2
1964
1965 movl+andl would be shorter than the movb+andb+movzbl sequence.
1966
1967 //===---------------------------------------------------------------------===//
1968
1969 For the following:
1970 struct u1 {
1971 float x, y;
1972 };
1973 float foo(struct u1 u) {
1974 return u.x + u.y;
1975 }
1976
1977 We currently generate:
1978 movdqa %xmm0, %xmm1
1979 pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
1980 addss %xmm1, %xmm0
1981 ret
1982
1983 We could save an instruction here by commuting the addss.
1984
1985 //===---------------------------------------------------------------------===//
1986
1987 This (from PR9661):
1988
1989 float clamp_float(float a) {
1990 if (a > 1.0f)
1991 return 1.0f;
1992 else if (a < 0.0f)
1993 return 0.0f;
1994 else
1995 return a;
1996 }
1997
1998 Could compile to:
1999
2000 clamp_float: # @clamp_float
2001 movss .LCPI0_0(%rip), %xmm1
2002 minss %xmm1, %xmm0
2003 pxor %xmm1, %xmm1
2004 maxss %xmm1, %xmm0
2005 ret
2006
2007 with -ffast-math.
2008
2009 //===---------------------------------------------------------------------===//
2010
2011 This function (from PR9803):
2012
2013 int clamp2(int a) {
2014 if (a > 5)
2015 a = 5;
2016 if (a < 0)
2017 return 0;
2018 return a;
2019 }
2020
2021 Compiles to:
2022
2023 _clamp2: ## @clamp2
2024 pushq %rbp
2025 movq %rsp, %rbp
2026 cmpl $5, %edi
2027 movl $5, %ecx
2028 cmovlel %edi, %ecx
2029 testl %ecx, %ecx
2030 movl $0, %eax
2031 cmovnsl %ecx, %eax
2032 popq %rbp
2033 ret
2034
The move of 0 could be scheduled above the test so that it can be materialized as xor reg,reg.
2036
2037 //===---------------------------------------------------------------------===//
2038
2039 GCC PR48986. We currently compile this:
2040
2041 void bar(void);
2042 void yyy(int* p) {
2043 if (__sync_fetch_and_add(p, -1) == 1)
2044 bar();
2045 }
2046
2047 into:
2048 movl $-1, %eax
2049 lock
2050 xaddl %eax, (%rdi)
2051 cmpl $1, %eax
2052 je LBB0_2
2053
2054 Instead we could generate:
2055
2056 lock
decl (%rdi)
2058 je LBB0_2
2059
2060 The trick is to match "fetch_and_add(X, -C) == C".
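
The same pattern generalizes to other constants; a hypothetical example with
C == 4 (the comment shows the lowering we would like):

void bar(void);
void yyy4(int* p) {
  if (__sync_fetch_and_add(p, -4) == 4)   /* ideally: lock subl $4, (%rdi); je ... */
    bar();
}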
2061
2062 //===---------------------------------------------------------------------===//
2063
2064 unsigned t(unsigned a, unsigned b) {
2065 return a <= b ? 5 : -5;
2066 }
2067
2068 We generate:
2069 movl $5, %ecx
2070 cmpl %esi, %edi
2071 movl $-5, %eax
2072 cmovbel %ecx, %eax
2073
2074 GCC:
2075 cmpl %edi, %esi
2076 sbbl %eax, %eax
2077 andl $-10, %eax
2078 addl $5, %eax
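
What GCC is doing, written back in C (t_branchless is only an illustration of
the sbb/and/add idiom, relying on the usual wrap-around of unsigned arithmetic):

unsigned t_branchless(unsigned a, unsigned b) {
  unsigned borrow = 0u - (a > b);   /* sbbl %eax, %eax: all-ones if a > b, else 0 */
  return (borrow & -10u) + 5u;      /* andl $-10, then addl $5: -5 if a > b, else 5 */
}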
2079
2080 //===---------------------------------------------------------------------===//
2081