1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
4
5 //===---------------------------------------------------------------------===//
6
7 SSE Variable shift can be custom lowered to something like this, which uses a
8 small table + unaligned load + shuffle instead of going through memory.
9
10 __m128i_shift_right:
11 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14 ...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
              _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
}
19
20 //===---------------------------------------------------------------------===//
21
SSE has instructions for doing operations on complex numbers; we should
pattern match them. For example, this should turn into a horizontal add:
24
25 typedef float __attribute__((vector_size(16))) v4f32;
26 float f32(v4f32 A) {
27 return A[0]+A[1]+A[2]+A[3];
28 }
29
30 Instead we get this:
31
32 _f32: ## @f32
33 pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
34 addss %xmm0, %xmm1
35 pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
36 movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
37 movaps %xmm0, %xmm3
38 addss %xmm1, %xmm3
39 movdqa %xmm2, %xmm0
40 addss %xmm3, %xmm0
41 ret
42
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:
45
46 _Complex float f32(_Complex float A, _Complex float B) {
47 return A+B;
48 }
49
50 into:
51
52 _f32: ## @f32
53 movdqa %xmm0, %xmm2
54 addss %xmm1, %xmm2
55 pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
56 pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
57 addss %xmm1, %xmm3
58 movaps %xmm2, %xmm0
59 unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60 ret
61
62 seems silly when it could just be one addps.
63
64
65 //===---------------------------------------------------------------------===//
66
67 Expand libm rounding functions inline: Significant speedups possible.
68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70 //===---------------------------------------------------------------------===//
71
When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
74
75 //===---------------------------------------------------------------------===//
76
77 Think about doing i64 math in SSE regs on x86-32.
78
79 //===---------------------------------------------------------------------===//
80
81 This testcase should have no SSE instructions in it, and only one load from
82 a constant pool:
83
84 double %test3(bool %B) {
85 %C = select bool %B, double 123.412, double 523.01123123
86 ret double %C
87 }
88
89 Currently, the select is being lowered, which prevents the dag combiner from
90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91
92 The pattern isel got this one right.
93
94 //===---------------------------------------------------------------------===//
95
96 SSE should implement 'select_cc' using 'emulated conditional moves' that use
97 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
98
99 double %X(double %Y, double %Z, double %A, double %B) {
100 %C = setlt double %A, %B
101 %z = fadd double %Z, 0.0 ;; select operand is not a load
102 %D = select bool %C, double %Y, double %z
103 ret double %D
104 }
105
106 We currently emit:
107
108 _X:
109 subl $12, %esp
110 xorpd %xmm0, %xmm0
111 addsd 24(%esp), %xmm0
112 movsd 32(%esp), %xmm1
113 movsd 16(%esp), %xmm2
114 ucomisd 40(%esp), %xmm1
115 jb LBB_X_2
116 LBB_X_1:
117 movsd %xmm0, %xmm2
118 LBB_X_2:
119 movsd %xmm2, (%esp)
120 fldl (%esp)
121 addl $12, %esp
122 ret
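
As a sketch of the intended lowering, here is the emulated conditional move
written with SSE2 intrinsics; this uses cmpltsd/andpd/andnpd/orpd, the FP forms
of the pcmp/pand/pandn/por idea. The helper name is made up and only lane 0 of
the result is meaningful:

#include <emmintrin.h>

/* D = (A < B) ? Y : Z, with no branch. */
static __m128d select_lt(__m128d A, __m128d B, __m128d Y, __m128d Z) {
  __m128d mask = _mm_cmplt_sd(A, B);        /* all-ones or all-zeros in lane 0 */
  return _mm_or_pd(_mm_and_pd(mask, Y),     /* mask ? Y : 0 */
                   _mm_andnot_pd(mask, Z)); /* mask ? 0 : Z */
}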
123
124 //===---------------------------------------------------------------------===//
125
126 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
127 feasible.
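
For illustration, the kind of expansion meant here, written as a C sketch with
SSE2 intrinsics (the helper name is made up; n is assumed to be a multiple of
16):

#include <emmintrin.h>
#include <stddef.h>

/* Copy n bytes with unaligned 128-bit SSE moves instead of rep/movs or a
   libcall. */
static void copy_sse(void *dst, const void *src, size_t n) {
  for (size_t i = 0; i != n; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)((const char *)src + i));
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
  }
}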
128
129 //===---------------------------------------------------------------------===//
130
131 Codegen:
132 if (copysign(1.0, x) == copysign(1.0, y))
133 into:
134 if (x^y & mask)
135 when using SSE.
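
For reference, the transformation in C terms (a minimal sketch assuming
IEEE-754 single precision; the helper name is made up):

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) is just "the sign bits agree". */
static int same_sign(float x, float y) {
  uint32_t xb, yb;
  memcpy(&xb, &x, sizeof xb);             /* bitcast, no FP ops needed */
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x80000000u) == 0;  /* x^y has the sign bit clear */
}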
136
137 //===---------------------------------------------------------------------===//
138
139 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
140 of a v4sf value.
141
142 //===---------------------------------------------------------------------===//
143
Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
146
147 //===---------------------------------------------------------------------===//
148
149 External test Nurbs exposed some problems. Look for
150 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
151 emits:
152
153 movaps (%edx), %xmm2 #59.21
154 movaps (%edx), %xmm5 #60.21
155 movaps (%edx), %xmm4 #61.21
156 movaps (%edx), %xmm3 #62.21
157 movl 40(%ecx), %ebp #69.49
158 shufps $0, %xmm2, %xmm5 #60.21
159 movl 100(%esp), %ebx #69.20
160 movl (%ebx), %edi #69.20
161 imull %ebp, %edi #69.49
162 addl (%eax), %edi #70.33
163 shufps $85, %xmm2, %xmm4 #61.21
164 shufps $170, %xmm2, %xmm3 #62.21
165 shufps $255, %xmm2, %xmm2 #63.21
166 lea (%ebp,%ebp,2), %ebx #69.49
167 negl %ebx #69.49
168 lea -3(%edi,%ebx), %ebx #70.33
169 shll $4, %ebx #68.37
170 addl 32(%ecx), %ebx #68.37
171 testb $15, %bl #91.13
172 jne L_B1.24 # Prob 5% #91.13
173
174 This is the llvm code after instruction scheduling:
175
176 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
177 %reg1078 = MOV32ri -3
178 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
179 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
180 %reg1080 = IMUL32rr %reg1079, %reg1037
181 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
182 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
183 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
184 %reg1082 = SHL32ri %reg1038, 4
185 %reg1039 = ADD32rr %reg1036, %reg1082
186 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
187 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
188 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
189 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
190 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
191 %reg1040 = MOV32rr %reg1039
192 %reg1084 = AND32ri8 %reg1039, 15
193 CMP32ri8 %reg1084, 0
194 JE mbb<cond_next204,0xa914d30>
195
196 Still ok. After register allocation:
197
198 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
199 %EAX = MOV32ri -3
200 %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
201 ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
202 %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
203 %EDX = MOV32rm %EDX, 1, %NOREG, 40
204 IMUL32rr %EAX<def&use>, %EDX
205 %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
206 %ESI = MOV32rm %ESI, 1, %NOREG, 0
207 MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
208 %EAX = LEA32r %ESI, 1, %EAX, -3
209 %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
210 %ESI = MOV32rm %ESI, 1, %NOREG, 32
211 %EDI = MOV32rr %EAX
212 SHL32ri %EDI<def&use>, 4
213 ADD32rr %EDI<def&use>, %ESI
214 %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
215 %XMM1 = MOVAPSrr %XMM0
216 SHUFPSrr %XMM1<def&use>, %XMM1, 170
217 %XMM2 = MOVAPSrr %XMM0
218 SHUFPSrr %XMM2<def&use>, %XMM2, 0
219 %XMM3 = MOVAPSrr %XMM0
220 SHUFPSrr %XMM3<def&use>, %XMM3, 255
221 SHUFPSrr %XMM0<def&use>, %XMM0, 85
222 %EBX = MOV32rr %EDI
223 AND32ri8 %EBX<def&use>, 15
224 CMP32ri8 %EBX, 0
225 JE mbb<cond_next204,0xa914d30>
226
This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two in more than one shufps, a number
of copies are needed. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator should
perform the two-address to three-address transformation.
232
233 It also exposes some other problems. See MOV32ri -3 and the spills.
234
235 //===---------------------------------------------------------------------===//
236
237 Consider:
238
239 __m128 test(float a) {
240 return _mm_set_ps(0.0, 0.0, 0.0, a*a);
241 }
242
243 This compiles into:
244
245 movss 4(%esp), %xmm1
246 mulss %xmm1, %xmm1
247 xorps %xmm0, %xmm0
248 movss %xmm1, %xmm0
249 ret
250
251 Because mulss doesn't modify the top 3 elements, the top elements of
252 xmm1 are already zero'd. We could compile this to:
253
254 movss 4(%esp), %xmm0
255 mulss %xmm0, %xmm0
256 ret
257
258 //===---------------------------------------------------------------------===//
259
260 Here's a sick and twisted idea. Consider code like this:
261
262 __m128 test(__m128 a) {
  float b = *(float*)&a;
264 ...
265 return _mm_set_ps(0.0, 0.0, 0.0, b);
266 }
267
268 This might compile to this code:
269
270 movaps c(%esp), %xmm1
271 xorps %xmm0, %xmm0
272 movss %xmm1, %xmm0
273 ret
274
275 Now consider if the ... code caused xmm1 to get spilled. This might produce
276 this code:
277
278 movaps c(%esp), %xmm1
279 movaps %xmm1, c2(%esp)
280 ...
281
282 xorps %xmm0, %xmm0
283 movaps c2(%esp), %xmm1
284 movss %xmm1, %xmm0
285 ret
286
287 However, since the reload is only used by these instructions, we could
288 "fold" it into the uses, producing something like this:
289
290 movaps c(%esp), %xmm1
291 movaps %xmm1, c2(%esp)
292 ...
293
294 movss c2(%esp), %xmm0
295 ret
296
297 ... saving two instructions.
298
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of all four
elements. This can be used to simplify a variety of shuffle operations, where
the elements are fixed zeros.
303
304 //===---------------------------------------------------------------------===//
305
306 This code generates ugly code, probably due to costs being off or something:
307
308 define void @test(float* %P, <4 x float>* %P2 ) {
309 %xFloat0.688 = load float* %P
310 %tmp = load <4 x float>* %P2
311 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
312 store <4 x float> %inFloat3.713, <4 x float>* %P2
313 ret void
314 }
315
316 Generates:
317
318 _test:
319 movl 8(%esp), %eax
320 movaps (%eax), %xmm0
321 pxor %xmm1, %xmm1
322 movaps %xmm0, %xmm2
323 shufps $50, %xmm1, %xmm2
324 shufps $132, %xmm2, %xmm0
325 movaps %xmm0, (%eax)
326 ret
327
328 Would it be better to generate:
329
330 _test:
331 movl 8(%esp), %ecx
332 movaps (%ecx), %xmm0
333 xor %eax, %eax
334 pinsrw $6, %eax, %xmm0
335 pinsrw $7, %eax, %xmm0
336 movaps %xmm0, (%ecx)
337 ret
338
339 ?
340
341 //===---------------------------------------------------------------------===//
342
343 Some useful information in the Apple Altivec / SSE Migration Guide:
344
345 http://developer.apple.com/documentation/Performance/Conceptual/
346 Accelerate_sse_migration/index.html
347
348 e.g. SSE select using and, andnot, or. Various SSE compare translations.
349
350 //===---------------------------------------------------------------------===//
351
352 Add hooks to commute some CMPP operations.
353
354 //===---------------------------------------------------------------------===//
355
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
358
359 //===---------------------------------------------------------------------===//
360
Floating point max / min are commutable when -enable-unsafe-fp-math is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.
364
365 //===---------------------------------------------------------------------===//
366
367 We should materialize vector constants like "all ones" and "signbit" with
368 code like:
369
370 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
371
372 and:
373 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
374 psrlq xmm1, 31 ; xmm1 = all 100000000000...
375
instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
378
379 //===---------------------------------------------------------------------===//
380
381 These functions:
382
383 #include <xmmintrin.h>
384 __m128i a;
385 void x(unsigned short n) {
386 a = _mm_slli_epi32 (a, n);
387 }
388 void y(unsigned n) {
389 a = _mm_slli_epi32 (a, n);
390 }
391
392 compile to ( -O3 -static -fomit-frame-pointer):
393 _x:
394 movzwl 4(%esp), %eax
395 movd %eax, %xmm0
396 movaps _a, %xmm1
397 pslld %xmm0, %xmm1
398 movaps %xmm1, _a
399 ret
400 _y:
401 movd 4(%esp), %xmm0
402 movaps _a, %xmm1
403 pslld %xmm0, %xmm1
404 movaps %xmm1, _a
405 ret
406
407 "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
408 like movd would be sufficient in both cases as the value is already zero
409 extended in the 32-bit stack slot IIRC. For signed short, it should also be
410 save, as a really-signed value would be undefined for pslld.
411
412
413 //===---------------------------------------------------------------------===//
414
415 #include <math.h>
416 int t1(double d) { return signbit(d); }
417
418 This currently compiles to:
419 subl $12, %esp
420 movsd 16(%esp), %xmm0
421 movsd %xmm0, (%esp)
422 movl 4(%esp), %eax
423 shrl $31, %eax
424 addl $12, %esp
425 ret
426
427 We should use movmskp{s|d} instead.
428
429 //===---------------------------------------------------------------------===//
430
431 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
432 (aligned) vector load. This functionality has a couple of problems.
433
434 1. The code to infer alignment from loads of globals is in the X86 backend,
435 not the dag combiner. This is because dagcombine2 needs to be able to see
436 through the X86ISD::Wrapper node, which DAGCombine can't really do.
437 2. The code for turning 4 x load into a single vector load is target
438 independent and should be moved to the dag combiner.
439 3. The code for turning 4 x load into a vector load can only handle a direct
440 load from a global or a direct load from the stack. It should be generalized
441 to handle any load from P, P+4, P+8, P+12, where P can be anything.
442 4. The alignment inference code cannot handle loads from globals in non-static
443 mode because it doesn't look through the extra dyld stub load. If you try
444 vec_align.ll without -relocation-model=static, you'll see what I mean.
445
446 //===---------------------------------------------------------------------===//
447
448 We should lower store(fneg(load p), q) into an integer load+xor+store, which
449 eliminates a constant pool load. For example, consider:
450
451 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
452 entry:
453 %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
454 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
455 ret i64 %tmp20
456 }
457 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
458
459 This currently compiles to:
460
461 LCPI1_0: # <4 x float>
462 .long 2147483648 # float -0
463 .long 2147483648 # float -0
464 .long 2147483648 # float -0
465 .long 2147483648 # float -0
466 _ccosf:
467 subl $12, %esp
468 movss 16(%esp), %xmm0
469 movss %xmm0, 4(%esp)
470 movss 20(%esp), %xmm0
471 xorps LCPI1_0, %xmm0
472 movss %xmm0, (%esp)
473 call L_ccoshf$stub
474 addl $12, %esp
475 ret
476
477 Note the load into xmm0, then xor (to negate), then store. In PIC mode,
478 this code computes the pic base and does two loads to do the constant pool
479 load, so the improvement is much bigger.
480
481 The tricky part about this xform is that the argument load/store isn't exposed
482 until post-legalize, and at that point, the fneg has been custom expanded into
483 an X86 fxor. This means that we need to handle this case in the x86 backend
484 instead of in target independent code.
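
For reference, the intended transformation in C terms (a minimal sketch
assuming IEEE-754 single precision; the helper name is made up):

#include <stdint.h>
#include <string.h>

/* store(fneg(load p), q) as integer load + xor + store: flip the sign bit
   without touching the SSE unit or the constant pool. */
static void store_negated(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);
  bits ^= 0x80000000u;             /* fneg == xor with the sign-bit mask */
  memcpy(q, &bits, sizeof bits);
}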
485
486 //===---------------------------------------------------------------------===//
487
488 Non-SSE4 insert into 16 x i8 is atrociously bad.
489
490 //===---------------------------------------------------------------------===//
491
492 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
493 is memory.
494
495 //===---------------------------------------------------------------------===//
496
497 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
498 sitting between the truncate and the extract.
499
500 //===---------------------------------------------------------------------===//
501
502 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
503 any number of 0.0 simultaneously. Currently we only use it for simple
504 insertions.
505
506 See comments in LowerINSERT_VECTOR_ELT_SSE4.
507
508 //===---------------------------------------------------------------------===//
509
510 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
511 Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
512 legal, it'll just take a few extra patterns written in the .td file.
513
514 Note: this is not a code quality issue; the custom lowered code happens to be
515 right, but we shouldn't have to custom lower anything. This is probably related
516 to <2 x i64> ops being so bad.
517
518 //===---------------------------------------------------------------------===//
519
520 'select' on vectors and scalars could be a whole lot better. We currently
521 lower them to conditional branches. On x86-64 for example, we compile this:
522
523 double test(double a, double b, double c, double d) { return a<b ? c : d; }
524
525 to:
526
527 _test:
528 ucomisd %xmm0, %xmm1
529 ja LBB1_2 # entry
530 LBB1_1: # entry
531 movapd %xmm3, %xmm2
532 LBB1_2: # entry
533 movapd %xmm2, %xmm0
534 ret
535
536 instead of:
537
538 _test:
539 cmpltsd %xmm1, %xmm0
540 andpd %xmm0, %xmm2
541 andnpd %xmm3, %xmm0
542 orpd %xmm2, %xmm0
543 ret
544
For unpredictable branches, the latter is much more efficient. This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.
548
549 //===---------------------------------------------------------------------===//
550
LLVM currently generates stack realignment code when it is not necessary. The
problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we
could end up producing unaligned loads/stores.
558
559 Fixing this will require some huge RA changes.
560
561 Testcase:
562 #include <emmintrin.h>
563
564 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
565
static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};
568
569 vSInt16 madd(vSInt16 b)
570 {
571 return _mm_madd_epi16(a, b);
572 }
573
574 Generated code (x86-32, linux):
575 madd:
576 pushl %ebp
577 movl %esp, %ebp
578 andl $-16, %esp
579 movaps .LCPI1_0, %xmm1
580 pmaddwd %xmm1, %xmm0
581 movl %ebp, %esp
582 popl %ebp
583 ret
584
585 //===---------------------------------------------------------------------===//
586
587 Consider:
588 #include <emmintrin.h>
589 __m128 foo2 (float x) {
590 return _mm_set_ps (0, 0, x, 0);
591 }
592
593 In x86-32 mode, we generate this spiffy code:
594
595 _foo2:
596 movss 4(%esp), %xmm0
597 pshufd $81, %xmm0, %xmm0
598 ret
599
In x86-64 mode, we generate this code, which could be better:
601
602 _foo2:
603 xorps %xmm1, %xmm1
604 movss %xmm0, %xmm1
605 pshufd $81, %xmm1, %xmm0
606 ret
607
In SSE4 mode, we could use insertps to make both better.
609
610 Here's another testcase that could use insertps [mem]:
611
612 #include <xmmintrin.h>
613 extern float x2, x3;
614 __m128 foo1 (float x1, float x4) {
615 return _mm_set_ps (x2, x1, x3, x4);
616 }
617
618 gcc mainline compiles it to:
619
620 foo1:
621 insertps $0x10, x2(%rip), %xmm0
622 insertps $0x10, x3(%rip), %xmm1
623 movaps %xmm1, %xmm2
624 movlhps %xmm0, %xmm2
625 movaps %xmm2, %xmm0
626 ret
627
628 //===---------------------------------------------------------------------===//
629
630 We compile vector multiply-by-constant into poor code:
631
632 define <4 x i32> @f(<4 x i32> %i) nounwind {
633 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
634 ret <4 x i32> %A
635 }
636
637 On targets without SSE4.1, this compiles into:
638
639 LCPI1_0: ## <4 x i32>
640 .long 10
641 .long 10
642 .long 10
643 .long 10
644 .text
645 .align 4,0x90
646 .globl _f
647 _f:
648 pshufd $3, %xmm0, %xmm1
649 movd %xmm1, %eax
650 imull LCPI1_0+12, %eax
651 movd %eax, %xmm1
652 pshufd $1, %xmm0, %xmm2
653 movd %xmm2, %eax
654 imull LCPI1_0+4, %eax
655 movd %eax, %xmm2
656 punpckldq %xmm1, %xmm2
657 movd %xmm0, %eax
658 imull LCPI1_0, %eax
659 movd %eax, %xmm1
660 movhlps %xmm0, %xmm0
661 movd %xmm0, %eax
662 imull LCPI1_0+8, %eax
663 movd %eax, %xmm0
664 punpckldq %xmm0, %xmm1
665 movaps %xmm1, %xmm0
666 punpckldq %xmm2, %xmm0
667 ret
668
669 It would be better to synthesize integer vector multiplication by constants
670 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
671 simple cases such as multiplication by powers of two would be better as
672 vector shifts than as multiplications.
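
As an illustration of the shift/add idea for this particular testcase
(multiply by 10), written with SSE2 intrinsics (the helper name is made up):

#include <emmintrin.h>

/* v * 10 == (v << 3) + (v << 1): two pslld and one paddd, no scalarization
   and no constant pool. */
static __m128i mul10(__m128i v) {
  return _mm_add_epi32(_mm_slli_epi32(v, 3), _mm_slli_epi32(v, 1));
}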
673
674 //===---------------------------------------------------------------------===//
675
676 We compile this:
677
678 __m128i
679 foo2 (char x)
680 {
681 return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
682 }
683
684 into:
685 movl $1, %eax
686 xorps %xmm0, %xmm0
687 pinsrw $2, %eax, %xmm0
688 movzbl 4(%esp), %eax
689 pinsrw $3, %eax, %xmm0
690 movl $256, %eax
691 pinsrw $7, %eax, %xmm0
692 ret
693
694
695 gcc-4.2:
696 subl $12, %esp
697 movzbl 16(%esp), %eax
698 movdqa LC0, %xmm0
699 pinsrw $3, %eax, %xmm0
700 addl $12, %esp
701 ret
702 .const
703 .align 4
704 LC0:
705 .word 0
706 .word 0
707 .word 1
708 .word 0
709 .word 0
710 .word 0
711 .word 0
712 .word 256
713
714 With SSE4, it should be
715 movdqa .LC0(%rip), %xmm0
716 pinsrb $6, %edi, %xmm0
717
718 //===---------------------------------------------------------------------===//
719
We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.
723
724 We compiled it to something horrible:
725
726 .align 4
727 LCPI1_1: ## float
728 .long 1065353216 ## float 1
729 .const
730
731 .align 4
732 LCPI1_0: ## <4 x float>
733 .space 4
734 .long 1065353216 ## float 1
735 .space 4
736 .long 1065353216 ## float 1
737 .text
738 .align 4,0x90
739 .globl _t
740 _t:
741 xorps %xmm0, %xmm0
742 movhps LCPI1_0, %xmm0
743 movss LCPI1_1, %xmm1
744 movaps %xmm0, %xmm2
745 shufps $2, %xmm1, %xmm2
746 shufps $132, %xmm2, %xmm0
747 movaps %xmm0, 0
748
749 //===---------------------------------------------------------------------===//
750 rdar://5907648
751
752 This function:
753
754 float foo(unsigned char x) {
755 return x;
756 }
757
compiles to this IR:
759
760 define float @foo(i8 zeroext %x) nounwind {
761 %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
762 ret float %tmp12
763 }
764
which codegens to (x86-32):
766
767 _foo:
768 subl $4, %esp
769 movzbl 8(%esp), %eax
770 cvtsi2ss %eax, %xmm0
771 movss %xmm0, (%esp)
772 flds (%esp)
773 addl $4, %esp
774 ret
775
776 We should be able to use:
  cvtsi2ss 8(%esp), %xmm0
778 since we know the stack slot is already zext'd.
779
780 //===---------------------------------------------------------------------===//
781
782 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
783 when code size is critical. movlps is slower than movsd on core2 but it's one
784 byte shorter.
785
786 //===---------------------------------------------------------------------===//
787
788 We should use a dynamic programming based approach to tell when using FPStack
789 operations is cheaper than SSE. SciMark montecarlo contains code like this
790 for example:
791
792 double MonteCarlo_num_flops(int Num_samples) {
793 return ((double) Num_samples)* 4.0;
794 }
795
796 In fpstack mode, this compiles into:
797
798 LCPI1_0:
799 .long 1082130432 ## float 4.000000e+00
800 _MonteCarlo_num_flops:
801 subl $4, %esp
802 movl 8(%esp), %eax
803 movl %eax, (%esp)
804 fildl (%esp)
805 fmuls LCPI1_0
806 addl $4, %esp
807 ret
808
809 in SSE mode, it compiles into significantly slower code:
810
811 _MonteCarlo_num_flops:
812 subl $12, %esp
813 cvtsi2sd 16(%esp), %xmm0
814 mulsd LCPI1_0, %xmm0
815 movsd %xmm0, (%esp)
816 fldl (%esp)
817 addl $12, %esp
818 ret
819
There are also other cases in SciMark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.
823
824 //===---------------------------------------------------------------------===//
825
826 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
827 "cmpsd". For example, this code:
828
829 double d1(double x) { return x == x ? x : x + x; }
830
831 Compiles into:
832
833 _d1:
834 ucomisd %xmm0, %xmm0
835 jnp LBB1_2
836 addsd %xmm0, %xmm0
837 ret
838 LBB1_2:
839 ret
840
841 Also, the 'ret's should be shared. This is PR6032.
842
843 //===---------------------------------------------------------------------===//
844
These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?
847
848 define float @foo(float %x) nounwind {
849 %t = bitcast float %x to i32
850 %s = and i32 %t, 2147483647
851 %d = bitcast i32 %s to float
852 ret float %d
853 }
854
855 declare float @fabsf(float %n)
856 define float @bar(float %x) nounwind {
857 %d = call float @fabsf(float %x)
858 ret float %d
859 }
860
861 //===---------------------------------------------------------------------===//
862
863 This IR (from PR6194):
864
865 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
866 target triple = "x86_64-apple-darwin10.0.0"
867
868 %0 = type { double, double }
869 %struct.float3 = type { float, float, float }
870
871 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
872 entry:
873 %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
874 %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
875 %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
876 %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
877 %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
878 %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
879 %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
880 store float %tmp12, float* %tmp5
881 ret void
882 }
883
884 Compiles to:
885
886 _test: ## @test
887 movd %xmm0, %rax
888 shrq $32, %rax
889 movl %eax, 4(%rdi)
890 ret
891
892 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
893 doing a shuffle from v[1] to v[0] then a float store.
894
895 //===---------------------------------------------------------------------===//
896
897 On SSE4 machines, we compile this code:
898
899 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
900 <2 x float> *%P) nounwind {
901 %Z = fadd <2 x float> %Q, %R
902
903 store <2 x float> %Z, <2 x float> *%P
904 ret <2 x float> %Z
905 }
906
907 into:
908
909 _test2: ## @test2
910 ## BB#0:
911 insertps $0, %xmm2, %xmm2
912 insertps $16, %xmm3, %xmm2
913 insertps $0, %xmm0, %xmm3
914 insertps $16, %xmm1, %xmm3
915 addps %xmm2, %xmm3
916 movq %xmm3, (%rdi)
917 movaps %xmm3, %xmm0
918 pshufd $1, %xmm3, %xmm1
919 ## kill: XMM1<def> XMM1<kill>
920 ret
921
922 The insertps's of $0 are pointless complex copies.
923
924 //===---------------------------------------------------------------------===//
925
926 If SSE4.1 is available we should inline rounding functions instead of emitting
927 a libcall.
928
929 floor: roundsd $0x01, %xmm, %xmm
930 ceil: roundsd $0x02, %xmm, %xmm
931
932 and likewise for the single precision versions.
933
934 Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
935 corresponding nodes and some targets (including X86) aren't ready for them.
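
For reference, what the inlined version looks like from C when SSE4.1 is
available (a sketch using intrinsics; the helper name is made up):

#include <smmintrin.h>

/* floor() as a single roundsd, no libcall. */
static double floor_sse41(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(
      _mm_round_sd(v, v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
}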
936
937 //===---------------------------------------------------------------------===//
938
1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend.
3 //===---------------------------------------------------------------------===//
4
We should add support for the "movbe" instruction, which does a byte-swapping
copy (3-addr bswap + memory support?). This is available on Atom processors.
7
8 //===---------------------------------------------------------------------===//
9
10 This should be one DIV/IDIV instruction, not a libcall:
11
12 unsigned test(unsigned long long X, unsigned Y) {
13 return X/Y;
14 }
15
16 This can be done trivially with a custom legalizer. What about overflow
17 though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
18
19 //===---------------------------------------------------------------------===//
20
21 Improvements to the multiply -> shift/add algorithm:
22 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
23
24 //===---------------------------------------------------------------------===//
25
26 Improve code like this (occurs fairly frequently, e.g. in LLVM):
27 long long foo(int x) { return 1LL << x; }
28
29 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
30 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
31 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
32
33 Another useful one would be ~0ULL >> X and ~0ULL << X.
34
35 One better solution for 1LL << x is:
36 xorl %eax, %eax
37 xorl %edx, %edx
38 testb $32, %cl
39 sete %al
40 setne %dl
41 sall %cl, %eax
42 sall %cl, %edx
43
44 But that requires good 8-bit subreg support.
45
46 Also, this might be better. It's an extra shift, but it's one instruction
47 shorter, and doesn't stress 8-bit subreg support.
48 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
49 but without the unnecessary and.)
50 movl %ecx, %eax
51 shrl $5, %eax
52 movl %eax, %edx
53 xorl $1, %edx
54 sall %cl, %eax
        sall %cl, %edx
56
57 64-bit shifts (in general) expand to really bad code. Instead of using
58 cmovs, we should expand to a conditional branch like GCC produces.
59
60 //===---------------------------------------------------------------------===//
61
62 Some isel ideas:
63
1. Dynamic programming based approach when compile time is not an
   issue.
66 2. Code duplication (addressing mode) during isel.
67 3. Other ideas from "Register-Sensitive Selection, Duplication, and
68 Sequencing of Instructions".
69 4. Scheduling for reduced register pressure. E.g. "Minimum Register
70 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
71 and other related papers.
72 http://citeseer.ist.psu.edu/govindarajan01minimum.html
73
74 //===---------------------------------------------------------------------===//
75
76 Should we promote i16 to i32 to avoid partial register update stalls?
77
78 //===---------------------------------------------------------------------===//
79
Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it though.
84
85 //===---------------------------------------------------------------------===//
86
It appears icc uses push for parameter passing. Need to investigate.
88
89 //===---------------------------------------------------------------------===//
90
91 This:
92
93 void foo(void);
94 void bar(int x, int *P) {
95 x >>= 2;
96 if (x)
97 foo();
98 *P = x;
99 }
100
101 compiles into:
102
103 movq %rsi, %rbx
104 movl %edi, %r14d
105 sarl $2, %r14d
106 testl %r14d, %r14d
107 je LBB0_2
108
109 Instead of doing an explicit test, we can use the flags off the sar. This
110 occurs in a bigger testcase like this, which is pretty common:
111
112 #include <vector>
113 int test1(std::vector<int> &X) {
114 int Sum = 0;
115 for (long i = 0, e = X.size(); i != e; ++i)
116 X[i] = 0;
117 return Sum;
118 }
119
120 //===---------------------------------------------------------------------===//
121
122 Only use inc/neg/not instructions on processors where they are faster than
123 add/sub/xor. They are slower on the P4 due to only updating some processor
124 flags.
125
126 //===---------------------------------------------------------------------===//
127
128 The instruction selector sometimes misses folding a load into a compare. The
129 pattern is written as (cmp reg, (load p)). Because the compare isn't
130 commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
132 when it can invert the result of the compare for free.
133
134 //===---------------------------------------------------------------------===//
135
136 In many cases, LLVM generates code like this:
137
138 _test:
139 movl 8(%esp), %eax
140 cmpl %eax, 4(%esp)
141 setl %al
142 movzbl %al, %eax
143 ret
144
On some processors (which ones?), it is more efficient to do this:
146
147 _test:
148 movl 8(%esp), %ebx
149 xor %eax, %eax
150 cmpl %ebx, 4(%esp)
151 setl %al
152 ret
153
154 Doing this correctly is tricky though, as the xor clobbers the flags.
155
156 //===---------------------------------------------------------------------===//
157
We should generate bts/btr/etc instructions on targets where they are cheap or
when code size is important, e.g., for:
160
161 void setbit(int *target, int bit) {
162 *target |= (1 << bit);
163 }
164 void clearbit(int *target, int bit) {
165 *target &= ~(1 << bit);
166 }
167
168 //===---------------------------------------------------------------------===//
169
170 Instead of the following for memset char*, 1, 10:
171
172 movl $16843009, 4(%edx)
173 movl $16843009, (%edx)
174 movw $257, 8(%edx)
175
176 It might be better to generate
177
178 movl $16843009, %eax
179 movl %eax, 4(%edx)
180 movl %eax, (%edx)
	movw %ax, 8(%edx)
182
183 when we can spare a register. It reduces code size.
184
185 //===---------------------------------------------------------------------===//
186
187 Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
188 get this:
189
190 define i32 @test1(i32 %X) {
191 %Y = sdiv i32 %X, 8
192 ret i32 %Y
193 }
194
195 _test1:
196 movl 4(%esp), %eax
197 movl %eax, %ecx
198 sarl $31, %ecx
199 shrl $29, %ecx
200 addl %ecx, %eax
201 sarl $3, %eax
202 ret
203
204 GCC knows several different ways to codegen it, one of which is this:
205
206 _test1:
207 movl 4(%esp), %eax
208 cmpl $-1, %eax
209 leal 7(%eax), %ecx
210 cmovle %ecx, %eax
211 sarl $3, %eax
212 ret
213
214 which is probably slower, but it's interesting at least :)
215
216 //===---------------------------------------------------------------------===//
217
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand-tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).
222
223 //===---------------------------------------------------------------------===//
224
225 Optimize this into something reasonable:
226 x * copysign(1.0, y) * copysign(1.0, z)
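
One reasonable form, as a C sketch (assuming IEEE-754 doubles and ignoring the
sign of NaN results; the helper name is made up): the two copysign factors only
flip the sign of x by the xor of the sign bits of y and z.

#include <stdint.h>
#include <string.h>

static double fold_signs(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ull;  /* flip x's sign by sign(y)^sign(z) */
  memcpy(&x, &xb, sizeof xb);
  return x;
}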
227
228 //===---------------------------------------------------------------------===//
229
230 Optimize copysign(x, *y) to use an integer load from y.
231
232 //===---------------------------------------------------------------------===//
233
234 The following tests perform worse with LSR:
235
236 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
237
238 //===---------------------------------------------------------------------===//
239
240 Adding to the list of cmp / test poor codegen issues:
241
242 int test(__m128 *A, __m128 *B) {
243 if (_mm_comige_ss(*A, *B))
244 return 3;
245 else
246 return 4;
247 }
248
249 _test:
250 movl 8(%esp), %eax
251 movaps (%eax), %xmm0
252 movl 4(%esp), %eax
253 movaps (%eax), %xmm1
254 comiss %xmm0, %xmm1
255 setae %al
256 movzbl %al, %ecx
257 movl $3, %eax
258 movl $4, %edx
259 cmpl $0, %ecx
260 cmove %edx, %eax
261 ret
262
Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.
There are a number of issues. 1) We are introducing a setcc between the result
of the intrinsic call and the select. 2) The intrinsic is expected to produce
an i32 value, so an any_extend (which becomes a zero extend) is added.
267
268 We probably need some kind of target DAG combine hook to fix this.
269
270 //===---------------------------------------------------------------------===//
271
272 We generate significantly worse code for this than GCC:
273 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
274 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
275
276 There is also one case we do worse on PPC.
277
278 //===---------------------------------------------------------------------===//
279
280 For this:
281
282 int test(int a)
283 {
284 return a * 3;
285 }
286
We currently emit:
288 imull $3, 4(%esp), %eax
289
Perhaps this is what we really should generate instead. Is imull three or four
cycles? Note: ICC generates this:
292 movl 4(%esp), %eax
293 leal (%eax,%eax,2), %eax
294
295 The current instruction priority is based on pattern complexity. The former is
296 more "complex" because it folds a load so the latter will not be emitted.
297
298 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
299 should always try to match LEA first since the LEA matching code does some
300 estimate to determine whether the match is profitable.
301
302 However, if we care more about code size, then imull is better. It's two bytes
303 shorter than movl + leal.
304
305 On a Pentium M, both variants have the same characteristics with regard
306 to throughput; however, the multiplication has a latency of four cycles, as
307 opposed to two cycles for the movl+lea variant.
308
309 //===---------------------------------------------------------------------===//
310
311 __builtin_ffs codegen is messy.
312
313 int ffs_(unsigned X) { return __builtin_ffs(X); }
314
315 llvm produces:
316 ffs_:
317 movl 4(%esp), %ecx
318 bsfl %ecx, %eax
319 movl $32, %edx
320 cmove %edx, %eax
321 incl %eax
322 xorl %edx, %edx
323 testl %ecx, %ecx
324 cmove %edx, %eax
325 ret
326
327 vs gcc:
328
329 _ffs_:
330 movl $-1, %edx
331 bsfl 4(%esp), %eax
332 cmove %edx, %eax
333 addl $1, %eax
334 ret
335
336 Another example of __builtin_ffs (use predsimplify to eliminate a select):
337
338 int foo (unsigned long j) {
339 if (j)
340 return __builtin_ffs (j) - 1;
341 else
342 return 0;
343 }
344
345 //===---------------------------------------------------------------------===//
346
It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
352
353 //===---------------------------------------------------------------------===//
354
355 define i32 @foo(i32* %a, i32 %t) {
356 entry:
357 br label %cond_true
358
359 cond_true: ; preds = %cond_true, %entry
360 %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
361 %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
362 %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
363 %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
364 %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
365 %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
366 %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
367 %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
368 br i1 %tmp, label %bb12, label %cond_true
369
370 bb12: ; preds = %cond_true
371 ret i32 %tmp7
372 }
373 is pessimized by -loop-reduce and -indvars
374
375 //===---------------------------------------------------------------------===//
376
377 u32 to float conversion improvement:
378
379 float uint32_2_float( unsigned u ) {
380 float fl = (int) (u & 0xffff);
381 float fh = (int) (u >> 16);
382 fh *= 0x1.0p16f;
383 return fh + fl;
384 }
385
386 00000000 subl $0x04,%esp
387 00000003 movl 0x08(%esp,1),%eax
388 00000007 movl %eax,%ecx
389 00000009 shrl $0x10,%ecx
390 0000000c cvtsi2ss %ecx,%xmm0
391 00000010 andl $0x0000ffff,%eax
392 00000015 cvtsi2ss %eax,%xmm1
393 00000019 mulss 0x00000078,%xmm0
394 00000021 addss %xmm1,%xmm0
395 00000025 movss %xmm0,(%esp,1)
396 0000002a flds (%esp,1)
397 0000002d addl $0x04,%esp
398 00000030 ret
399
400 //===---------------------------------------------------------------------===//
401
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
404
405 //===---------------------------------------------------------------------===//
406
407 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
408 simplifications for integer "x cmp y ? a : b".
409
410 //===---------------------------------------------------------------------===//
411
412 Consider the expansion of:
413
414 define i32 @test3(i32 %X) {
415 %tmp1 = urem i32 %X, 255
416 ret i32 %tmp1
417 }
418
419 Currently it compiles to:
420
421 ...
422 movl $2155905153, %ecx
423 movl 8(%esp), %esi
424 movl %esi, %eax
425 mull %ecx
426 ...
427
428 This could be "reassociated" into:
429
430 movl $2155905153, %eax
431 movl 8(%esp), %ecx
432 mull %ecx
433
434 to avoid the copy. In fact, the existing two-address stuff would do this
435 except that mul isn't a commutative 2-addr instruction. I guess this has
436 to be done at isel time based on the #uses to mul?
437
438 //===---------------------------------------------------------------------===//
439
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example from 256.bzip2:
443
444 In the new trace, the hot loop has an instruction which crosses a cacheline
445 boundary. In addition to potential cache misses, this can't help decoding as I
446 imagine there has to be some kind of complicated decoder reset and realignment
447 to grab the bytes from the next cacheline.
448
449 532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
450 942 942 0x3d03 movl %dh, (1809(%esp, %esi)
451 937 937 0x3d0a incl %esi
452 3 3 0x3d0b cmpb %bl, %dl
453 27 27 0x3d0d jnz 0x000062db <main+11707>
454
455 //===---------------------------------------------------------------------===//
456
In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
458
459 //===---------------------------------------------------------------------===//
460
461 This could be a single 16-bit load.
462
463 int f(char *p) {
464 if ((p[0] == 1) & (p[1] == 2)) return 1;
465 return 0;
466 }
467
468 //===---------------------------------------------------------------------===//
469
470 We should inline lrintf and probably other libc functions.
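
For example, lrintf is just a cvtss2si when SSE is available (a sketch for
x86-32 where long is 32 bits; the helper name is made up):

#include <xmmintrin.h>

/* Rounds according to the current MXCSR rounding mode, as lrintf does. */
static long lrintf_inline(float x) {
  return _mm_cvtss_si32(_mm_set_ss(x));
}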
471
472 //===---------------------------------------------------------------------===//
473
474 Use the FLAGS values from arithmetic instructions more. For example, compile:
475
476 int add_zf(int *x, int y, int a, int b) {
477 if ((*x += y) == 0)
478 return a;
479 else
480 return b;
481 }
482
483 to:
484 addl %esi, (%rdi)
485 movl %edx, %eax
486 cmovne %ecx, %eax
487 ret
488 instead of:
489
490 _add_zf:
491 addl (%rdi), %esi
492 movl %esi, (%rdi)
493 testl %esi, %esi
494 cmove %edx, %ecx
495 movl %ecx, %eax
496 ret
497
498 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
499 without a test instruction.
500
501 //===---------------------------------------------------------------------===//
502
503 These two functions have identical effects:
504
505 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
506 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
507
508 We currently compile them to:
509
510 _f:
511 movl 4(%esp), %eax
512 movl %eax, %ecx
513 incl %ecx
514 movl 8(%esp), %edx
515 cmpl %edx, %ecx
516 jne LBB1_2 #UnifiedReturnBlock
517 LBB1_1: #cond_true
518 addl $2, %eax
519 ret
520 LBB1_2: #UnifiedReturnBlock
521 movl %ecx, %eax
522 ret
523 _f2:
524 movl 4(%esp), %eax
525 movl %eax, %ecx
526 incl %ecx
527 cmpl 8(%esp), %ecx
528 sete %cl
529 movzbl %cl, %ecx
530 leal 1(%ecx,%eax), %eax
531 ret
532
533 both of which are inferior to GCC's:
534
535 _f:
536 movl 4(%esp), %edx
537 leal 1(%edx), %eax
538 addl $2, %edx
539 cmpl 8(%esp), %eax
540 cmove %edx, %eax
541 ret
542 _f2:
543 movl 4(%esp), %eax
544 addl $1, %eax
545 xorl %edx, %edx
546 cmpl 8(%esp), %eax
547 sete %dl
548 addl %edx, %eax
549 ret
550
551 //===---------------------------------------------------------------------===//
552
553 This code:
554
555 void test(int X) {
556 if (X) abort();
557 }
558
559 is currently compiled to:
560
561 _test:
562 subl $12, %esp
563 cmpl $0, 16(%esp)
564 jne LBB1_1
565 addl $12, %esp
566 ret
567 LBB1_1:
568 call L_abort$stub
569
570 It would be better to produce:
571
572 _test:
573 subl $12, %esp
574 cmpl $0, 16(%esp)
575 jne L_abort$stub
576 addl $12, %esp
577 ret
578
579 This can be applied to any no-return function call that takes no arguments etc.
580 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
581 something like this:
582
583 _test:
584 cmpl $0, 4(%esp)
585 jne LBB1_1
586 ret
587 LBB1_1:
588 subl $12, %esp
589 call L_abort$stub
590
591 Both are useful in different situations. Finally, it could be shrink-wrapped
592 and tail called, like this:
593
594 _test:
595 cmpl $0, 4(%esp)
596 jne LBB1_1
597 ret
598 LBB1_1:
599 pop %eax # realign stack.
600 call L_abort$stub
601
602 Though this probably isn't worth it.
603
604 //===---------------------------------------------------------------------===//
605
606 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
607 a neg instead of a sub instruction. Consider:
608
609 int test(char X) { return 7-X; }
610
611 we currently produce:
612 _test:
613 movl $7, %eax
614 movsbl 4(%esp), %ecx
615 subl %ecx, %eax
616 ret
617
618 We would use one fewer register if codegen'd as:
619
620 movsbl 4(%esp), %eax
621 neg %eax
622 add $7, %eax
623 ret
624
625 Note that this isn't beneficial if the load can be folded into the sub. In
626 this case, we want a sub:
627
628 int test(int X) { return 7-X; }
629 _test:
630 movl $7, %eax
631 subl 4(%esp), %eax
632 ret
633
634 //===---------------------------------------------------------------------===//
635
636 Leaf functions that require one 4-byte spill slot have a prolog like this:
637
638 _foo:
639 pushl %esi
640 subl $4, %esp
641 ...
642 and an epilog like this:
643 addl $4, %esp
644 popl %esi
645 ret
646
647 It would be smaller, and potentially faster, to push eax on entry and to
648 pop into a dummy register instead of using addl/subl of esp. Just don't pop
649 into any return registers :)
650
651 //===---------------------------------------------------------------------===//
652
653 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
654 branches. We generate really poor code for:
655
656 double testf(double a) {
657 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
658 }
659
660 For example, the entry BB is:
661
662 _testf:
663 subl $20, %esp
664 pxor %xmm0, %xmm0
665 movsd 24(%esp), %xmm1
666 ucomisd %xmm0, %xmm1
667 setnp %al
668 sete %cl
669 testb %cl, %al
670 jne LBB1_5 # UnifiedReturnBlock
671 LBB1_1: # cond_true
672
673
674 it would be better to replace the last four instructions with:
675
676 jp LBB1_1
677 je LBB1_5
678 LBB1_1:
679
680 We also codegen the inner ?: into a diamond:
681
682 cvtss2sd LCPI1_0(%rip), %xmm2
683 cvtss2sd LCPI1_1(%rip), %xmm3
684 ucomisd %xmm1, %xmm0
685 ja LBB1_3 # cond_true
686 LBB1_2: # cond_true
687 movapd %xmm3, %xmm2
688 LBB1_3: # cond_true
689 movapd %xmm2, %xmm0
690 ret
691
692 We should sink the load into xmm3 into the LBB1_2 block. This should
693 be pretty easy, and will nuke all the copies.
694
695 //===---------------------------------------------------------------------===//
696
697 This:
698 #include <algorithm>
699 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
700 { return std::make_pair(a + b, a + b < a); }
701 bool no_overflow(unsigned a, unsigned b)
702 { return !full_add(a, b).second; }
703
704 Should compile to:
705 addl %esi, %edi
706 setae %al
707 movzbl %al, %eax
708 ret
709
710 on x86-64, instead of the rather stupid-looking:
711 addl %esi, %edi
712 setb %al
713 xorb $1, %al
714 movzbl %al, %eax
715 ret
716
717
718 //===---------------------------------------------------------------------===//
719
720 The following code:
721
722 bb114.preheader: ; preds = %cond_next94
723 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
724 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
725 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
726 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
727 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
728 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
729 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
730 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
731 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
732 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
733 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
734 br label %bb114
735
736 produces:
737
738 LBB3_5: # bb114.preheader
739 movswl -68(%ebp), %eax
740 movl $32, %ecx
741 movl %ecx, -80(%ebp)
742 subl %eax, -80(%ebp)
743 movswl -52(%ebp), %eax
744 movl %ecx, -84(%ebp)
745 subl %eax, -84(%ebp)
746 movswl -70(%ebp), %eax
747 movl %ecx, -88(%ebp)
748 subl %eax, -88(%ebp)
749 movswl -50(%ebp), %eax
750 subl %eax, %ecx
751 movl %ecx, -76(%ebp)
752 movswl -42(%ebp), %eax
753 movl %eax, -92(%ebp)
754 movswl -66(%ebp), %eax
755 movl %eax, -96(%ebp)
756 movw $0, -98(%ebp)
757
758 This appears to be bad because the RA is not folding the store to the stack
759 slot into the movl. The above instructions could be:
760 movl $32, -80(%ebp)
761 ...
762 movl $32, -84(%ebp)
763 ...
764 This seems like a cross between remat and spill folding.
765
766 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
767 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
768 vice-versa).
769
770 //===---------------------------------------------------------------------===//
771
772 This code:
773
774 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
775 br i1 %tmp659, label %cond_true662, label %cond_next715
776
777 produces this:
778
779 testw %cx, %cx
780 movswl %cx, %esi
781 jns LBB4_109 # cond_next715
782
783 Shark tells us that using %cx in the testw instruction is sub-optimal. It
784 suggests using the 32-bit register (which is what ICC uses).
785
786 //===---------------------------------------------------------------------===//
787
788 We compile this:
789
790 void compare (long long foo) {
791 if (foo < 4294967297LL)
792 abort();
793 }
794
795 to:
796
797 compare:
798 subl $4, %esp
799 cmpl $0, 8(%esp)
800 setne %al
801 movzbw %al, %ax
802 cmpl $1, 12(%esp)
803 setg %cl
804 movzbw %cl, %cx
805 cmove %ax, %cx
806 testb $1, %cl
807 jne .LBB1_2 # UnifiedReturnBlock
808 .LBB1_1: # ifthen
809 call abort
810 .LBB1_2: # UnifiedReturnBlock
811 addl $4, %esp
812 ret
813
814 (also really horrible code on ppc). This is due to the expand code for 64-bit
815 compares. GCC produces multiple branches, which is much nicer:
816
817 compare:
818 subl $12, %esp
819 movl 20(%esp), %edx
820 movl 16(%esp), %eax
821 decl %edx
822 jle .L7
823 .L5:
824 addl $12, %esp
825 ret
826 .p2align 4,,7
827 .L7:
828 jl .L4
829 cmpl $0, %eax
830 .p2align 4,,8
831 ja .L5
832 .L4:
833 .p2align 4,,9
834 call abort
835
836 //===---------------------------------------------------------------------===//
837
Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail-call-optimized calls) that source from the caller's arguments
or from a virtual register (which may itself source from the caller's
arguments).
This is done to prevent overwriting of parameters (see the example
below) that might be used later.
845
846 example:
847
848 int callee(int32, int64);
849 int caller(int32 arg1, int32 arg2) {
850 int64 local = arg2 * 2;
851 return callee(arg2, (int64)local);
852 }
853
854 [arg1] [!arg2 no longer valid since we moved local onto it]
855 [arg2] -> [(int64)
856 [RETADDR] local ]
857
Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.
860
Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee, and only
   push those onto the top of the stack.
867
868 int callee (int32 arg1, int32 arg2);
869 int caller (int32 arg1, int32 arg2) {
870 return callee(arg1,arg2);
871 }
872
873 Here we don't need to write any variables to the top of the stack
874 since they don't overwrite each other.
875
876 int callee (int32 arg1, int32 arg2);
877 int caller (int32 arg1, int32 arg2) {
878 return callee(arg2,arg1);
879 }
880
881 Here we need to push the arguments because they overwrite each
882 other.
883
884 //===---------------------------------------------------------------------===//
885
886 main ()
887 {
888 int i = 0;
889 unsigned long int z = 0;
890
891 do {
892 z -= 0x00004000;
893 i++;
894 if (i > 0x00040000)
895 abort ();
896 } while (z > 0);
897 exit (0);
898 }
899
900 gcc compiles this to:
901
902 _main:
903 subl $28, %esp
904 xorl %eax, %eax
905 jmp L2
906 L3:
907 cmpl $262144, %eax
908 je L10
909 L2:
910 addl $1, %eax
911 cmpl $262145, %eax
912 jne L3
913 call L_abort$stub
914 L10:
915 movl $0, (%esp)
916 call L_exit$stub
917
918 llvm:
919
920 _main:
921 subl $12, %esp
922 movl $1, %eax
923 movl $16384, %ecx
924 LBB1_1: # bb
925 cmpl $262145, %eax
926 jge LBB1_4 # cond_true
927 LBB1_2: # cond_next
928 incl %eax
929 addl $4294950912, %ecx
930 cmpl $16384, %ecx
931 jne LBB1_1 # bb
932 LBB1_3: # bb11
933 xorl %eax, %eax
934 addl $12, %esp
935 ret
936 LBB1_4: # cond_true
937 call L_abort$stub
938
939 1. LSR should rewrite the first cmp with induction variable %ecx.
940 2. DAG combiner should fold
941 leal 1(%eax), %edx
942 cmpl $262145, %edx
943 =>
944 cmpl $262144, %eax
945
946 //===---------------------------------------------------------------------===//
947
948 define i64 @test(double %X) {
949 %Y = fptosi double %X to i64
950 ret i64 %Y
951 }
952
953 compiles to:
954
955 _test:
956 subl $20, %esp
957 movsd 24(%esp), %xmm0
958 movsd %xmm0, 8(%esp)
959 fldl 8(%esp)
960 fisttpll (%esp)
961 movl 4(%esp), %edx
962 movl (%esp), %eax
963 addl $20, %esp
964 #FP_REG_KILL
965 ret
966
967 This should just fldl directly from the input stack slot.
968
969 //===---------------------------------------------------------------------===//
970
971 This code:
972 int foo (int x) { return (x & 65535) | 255; }
973
974 Should compile into:
975
976 _foo:
977 movzwl 4(%esp), %eax
978 orl $255, %eax
979 ret
980
981 instead of:
982 _foo:
983 movl $65280, %eax
984 andl 4(%esp), %eax
985 orl $255, %eax
986 ret
987
988 //===---------------------------------------------------------------------===//
989
990 We're codegen'ing multiply of long longs inefficiently:
991
992 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
993 return arg1 * arg2;
994 }
995
996 We compile to (fomit-frame-pointer):
997
998 _LLM:
999 pushl %esi
1000 movl 8(%esp), %ecx
1001 movl 16(%esp), %esi
1002 movl %esi, %eax
1003 mull %ecx
1004 imull 12(%esp), %esi
1005 addl %edx, %esi
1006 imull 20(%esp), %ecx
1007 movl %esi, %edx
1008 addl %ecx, %edx
1009 popl %esi
1010 ret
1011
1012 This looks like a scheduling deficiency and lack of remat of the load from
1013 the argument area. ICC apparently produces:
1014
1015 movl 8(%esp), %ecx
1016 imull 12(%esp), %ecx
1017 movl 16(%esp), %eax
1018 imull 4(%esp), %eax
1019 addl %eax, %ecx
1020 movl 4(%esp), %eax
1021 mull 12(%esp)
1022 addl %ecx, %edx
1023 ret
1024
1025 Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
1026 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1027
1028 //===---------------------------------------------------------------------===//
1029
1030 We can fold a store into "zeroing a reg". Instead of:
1031
1032 xorl %eax, %eax
1033 movl %eax, 124(%esp)
1034
1035 we should get:
1036
1037 movl $0, 124(%esp)
1038
1039 if the flags of the xor are dead.
1040
1041 Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
1042 be folded into: shl [mem], 1
1043
1044 //===---------------------------------------------------------------------===//
1045
1046 In SSE mode, we turn abs and neg into a load from the constant pool plus an xor
1047 or an and instruction, for example:
1048
1049 xorpd LCPI1_0, %xmm2
1050
1051 However, if xmm2 gets spilled, we end up with really ugly code like this:
1052
1053 movsd (%esp), %xmm0
1054 xorpd LCPI1_0, %xmm0
1055 movsd %xmm0, (%esp)
1056
1057 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1058 the neg/abs instruction, turning it into an *integer* operation, like this:
1059
1060 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
1061
1062 You could also use xorb, but xorl is less likely to lead to a partial register
1063 stall. Here is a contrived testcase:
1064
1065 double a, b, c;
1066 void test(double *P) {
1067 double X = *P;
1068 a = X;
1069 bar();
1070 X = -X;
1071 b = X;
1072 bar();
1073 c = X;
1074 }
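
For clarity, a minimal sketch (hypothetical helper, little-endian IEEE-754 double
assumed) of the integer view of the negate being described, i.e. flipping the
sign bit through the high 32-bit word of the spilled value:

#include <stdint.h>
#include <string.h>
void fneg_in_place(double *p) {
  uint32_t hi;                       /* high word of the double holds the sign bit */
  memcpy(&hi, (char *)p + 4, 4);
  hi ^= 0x80000000u;                 /* the xorl $2147483648, [mem+4] from above   */
  memcpy((char *)p + 4, &hi, 4);
}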
1075
1076 //===---------------------------------------------------------------------===//
1077
1078 The code generated on x86 for checking for signed overflow on a multiply,
1079 written the obvious way, is much longer than it needs to be.
1080
1081 int x(int a, int b) {
1082 long long prod = (long long)a*b;
1083 return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1084 }
1085
1086 See PR2053 for more details.
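
This is not the codegen fix the note asks for, but for comparison the same check
can be written with the overflow builtin, which maps to an imull plus a flag
test (a sketch, assuming a compiler that provides __builtin_mul_overflow):

int x_overflow(int a, int b) {
  int prod;
  return __builtin_mul_overflow(a, b, &prod);   /* imull ; seto */
}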
1087
1088 //===---------------------------------------------------------------------===//
1089
1090 We should investigate using cdq/cltd (effect: edx = sar eax, 31)
1091 more aggressively; it should cost the same as a move+shift on any modern
1092 processor, but it's a lot shorter. The downside is that it puts more
1093 pressure on register allocation because it has fixed operands.
1094
1095 Example:
1096 int abs(int x) {return x < 0 ? -x : x;}
1097
1098 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1099 abs:
1100 movl 4(%esp), %eax
1101 cltd
1102 xorl %edx, %eax
1103 subl %edx, %eax
1104 ret
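
The gcc sequence is the classic branchless abs idiom; a C restatement
(hypothetical name, assuming arithmetic right shift of negative ints, and
ignoring INT_MIN as usual):

int abs_branchless(int x) {
  int m = x >> 31;        /* cltd: all-ones if x < 0, else zero */
  return (x ^ m) - m;     /* xorl %edx, %eax ; subl %edx, %eax  */
}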
1105
1106 //===---------------------------------------------------------------------===//
1107
1108 Take the following code (from
1109 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1110
1111 extern unsigned char first_one[65536];
1112 int FirstOnet(unsigned long long arg1)
1113 {
1114 if (arg1 >> 48)
1115 return (first_one[arg1 >> 48]);
1116 return 0;
1117 }
1118
1119
1120 The following code is currently generated:
1121 FirstOnet:
1122 movl 8(%esp), %eax
1123 cmpl $65536, %eax
1124 movl 4(%esp), %ecx
1125 jb .LBB1_2 # UnifiedReturnBlock
1126 .LBB1_1: # ifthen
1127 shrl $16, %eax
1128 movzbl first_one(%eax), %eax
1129 ret
1130 .LBB1_2: # UnifiedReturnBlock
1131 xorl %eax, %eax
1132 ret
1133
1134 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1135 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
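
A source-level restatement of the suggested transform (hypothetical variant
name): the shift only uses the top 16 bits, so they can be loaded zero-extended
and tested directly:

extern unsigned char first_one[65536];
int FirstOnet2(unsigned long long arg1) {
  unsigned short top = (unsigned short)(arg1 >> 48);  /* movzwl of the high half  */
  if (top)                                            /* testl, no cmpl, no shift */
    return first_one[top];
  return 0;
}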
1136
1137 //===---------------------------------------------------------------------===//
1138
1139 We compile this function:
1140
1141 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
1142 entry:
1143 %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
1144 br i1 %tmp2, label %bb7, label %bb
1145
1146 bb: ; preds = %entry
1147 %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
1148 ret i32 %tmp6
1149
1150 bb7: ; preds = %entry
1151 %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
1152 ret i32 %tmp10
1153 }
1154
1155 to:
1156
1157 foo: # @foo
1158 # BB#0: # %entry
1159 movl 4(%esp), %ecx
1160 cmpb $0, 16(%esp)
1161 je .LBB0_2
1162 # BB#1: # %bb
1163 movl 8(%esp), %eax
1164 addl %ecx, %eax
1165 ret
1166 .LBB0_2: # %bb7
1167 movl 12(%esp), %edx
1168 movl %ecx, %eax
1169 subl %edx, %eax
1170 ret
1171
1172 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1173 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1174
1175 //===---------------------------------------------------------------------===//
1176
1177 See rdar://4653682.
1178
1179 From flops:
1180
1181 LBB1_15: # bb310
1182 cvtss2sd LCPI1_0, %xmm1
1183 addsd %xmm1, %xmm0
1184 movsd 176(%esp), %xmm2
1185 mulsd %xmm0, %xmm2
1186 movapd %xmm2, %xmm3
1187 mulsd %xmm3, %xmm3
1188 movapd %xmm3, %xmm4
1189 mulsd LCPI1_23, %xmm4
1190 addsd LCPI1_24, %xmm4
1191 mulsd %xmm3, %xmm4
1192 addsd LCPI1_25, %xmm4
1193 mulsd %xmm3, %xmm4
1194 addsd LCPI1_26, %xmm4
1195 mulsd %xmm3, %xmm4
1196 addsd LCPI1_27, %xmm4
1197 mulsd %xmm3, %xmm4
1198 addsd LCPI1_28, %xmm4
1199 mulsd %xmm3, %xmm4
1200 addsd %xmm1, %xmm4
1201 mulsd %xmm2, %xmm4
1202 movsd 152(%esp), %xmm1
1203 addsd %xmm4, %xmm1
1204 movsd %xmm1, 152(%esp)
1205 incl %eax
1206 cmpl %eax, %esi
1207 jge LBB1_15 # bb310
1208 LBB1_16: # bb358.loopexit
1209 movsd 152(%esp), %xmm0
1210 addsd %xmm0, %xmm0
1211 addsd LCPI1_22, %xmm0
1212 movsd %xmm0, 152(%esp)
1213
1214 Rather than spilling the result of the last addsd in the loop, we should have
1215 inserted a copy to split the interval (one for the duration of the loop, one
1216 extending to the fall through). The register pressure in the loop isn't high
1217 enough to warrant the spill.
1218
1219 Also check why xmm7 is not used at all in the function.
1220
1221 //===---------------------------------------------------------------------===//
1222
1223 Take the following:
1224
1225 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1226 target triple = "i386-apple-darwin8"
1227 @in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
1228 define fastcc void @abort_gzip() noreturn nounwind {
1229 entry:
1230 %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
1231 br i1 %tmp.b.i, label %bb.i, label %bb4.i
1232 bb.i: ; preds = %entry
1233 tail call void @exit( i32 1 ) noreturn nounwind
1234 unreachable
1235 bb4.i: ; preds = %entry
1236 store i1 true, i1* @in_exit.4870.b
1237 tail call void @exit( i32 1 ) noreturn nounwind
1238 unreachable
1239 }
1240 declare void @exit(i32) noreturn nounwind
1241
1242 This compiles into:
1243 _abort_gzip: ## @abort_gzip
1244 ## BB#0: ## %entry
1245 subl $12, %esp
1246 movb _in_exit.4870.b, %al
1247 cmpb $1, %al
1248 jne LBB0_2
1249
1250 We somehow miss folding the movb into the cmpb.
1251
1252 //===---------------------------------------------------------------------===//
1253
1254 We compile:
1255
1256 int test(int x, int y) {
1257 return x-y-1;
1258 }
1259
1260 into (-m64):
1261
1262 _test:
1263 decl %edi
1264 movl %edi, %eax
1265 subl %esi, %eax
1266 ret
1267
1268 it would be better to codegen as: x+~y (notl+addl)
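
A quick sanity check of the identity being used (two's complement, ignoring
signed-overflow pedantry; hypothetical helper):

int sub_identity(int x, int y) {
  return (x - y - 1) == (x + ~y);   /* ~y == -y - 1, so this is always 1 */
}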
1269
1270 //===---------------------------------------------------------------------===//
1271
1272 This code:
1273
1274 int foo(const char *str,...)
1275 {
1276 __builtin_va_list a; int x;
1277 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1278 return x;
1279 }
1280
1281 gets compiled into this on x86-64:
1282 subq $200, %rsp
1283 movaps %xmm7, 160(%rsp)
1284 movaps %xmm6, 144(%rsp)
1285 movaps %xmm5, 128(%rsp)
1286 movaps %xmm4, 112(%rsp)
1287 movaps %xmm3, 96(%rsp)
1288 movaps %xmm2, 80(%rsp)
1289 movaps %xmm1, 64(%rsp)
1290 movaps %xmm0, 48(%rsp)
1291 movq %r9, 40(%rsp)
1292 movq %r8, 32(%rsp)
1293 movq %rcx, 24(%rsp)
1294 movq %rdx, 16(%rsp)
1295 movq %rsi, 8(%rsp)
1296 leaq (%rsp), %rax
1297 movq %rax, 192(%rsp)
1298 leaq 208(%rsp), %rax
1299 movq %rax, 184(%rsp)
1300 movl $48, 180(%rsp)
1301 movl $8, 176(%rsp)
1302 movl 176(%rsp), %eax
1303 cmpl $47, %eax
1304 jbe .LBB1_3 # bb
1305 .LBB1_1: # bb3
1306 movq 184(%rsp), %rcx
1307 leaq 8(%rcx), %rax
1308 movq %rax, 184(%rsp)
1309 .LBB1_2: # bb4
1310 movl (%rcx), %eax
1311 addq $200, %rsp
1312 ret
1313 .LBB1_3: # bb
1314 movl %eax, %ecx
1315 addl $8, %eax
1316 addq 192(%rsp), %rcx
1317 movl %eax, 176(%rsp)
1318 jmp .LBB1_2 # bb4
1319
1320 gcc 4.3 generates:
1321 subq $96, %rsp
1322 .LCFI0:
1323 leaq 104(%rsp), %rax
1324 movq %rsi, -80(%rsp)
1325 movl $8, -120(%rsp)
1326 movq %rax, -112(%rsp)
1327 leaq -88(%rsp), %rax
1328 movq %rax, -104(%rsp)
1329 movl $8, %eax
1330 cmpl $48, %eax
1331 jb .L6
1332 movq -112(%rsp), %rdx
1333 movl (%rdx), %eax
1334 addq $96, %rsp
1335 ret
1336 .p2align 4,,10
1337 .p2align 3
1338 .L6:
1339 mov %eax, %edx
1340 addq -104(%rsp), %rdx
1341 addl $8, %eax
1342 movl %eax, -120(%rsp)
1343 movl (%rdx), %eax
1344 addq $96, %rsp
1345 ret
1346
1347 and it gets compiled into this on x86:
1348 pushl %ebp
1349 movl %esp, %ebp
1350 subl $4, %esp
1351 leal 12(%ebp), %eax
1352 movl %eax, -4(%ebp)
1353 leal 16(%ebp), %eax
1354 movl %eax, -4(%ebp)
1355 movl 12(%ebp), %eax
1356 addl $4, %esp
1357 popl %ebp
1358 ret
1359
1360 gcc 4.3 generates:
1361 pushl %ebp
1362 movl %esp, %ebp
1363 movl 12(%ebp), %eax
1364 popl %ebp
1365 ret
1366
1367 //===---------------------------------------------------------------------===//
1368
1369 Teach tblgen not to check bitconvert source type in some cases. This allows us
1370 to consolidate the following patterns in X86InstrMMX.td:
1371
1372 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1373 (iPTR 0))))),
1374 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1375 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1376 (iPTR 0))))),
1377 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1378 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1379 (iPTR 0))))),
1380 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1381
1382 There are other cases in various td files.
1383
1384 //===---------------------------------------------------------------------===//
1385
1386 Take something like the following on x86-32:
1387 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1388
1389 We currently generate a libcall, but we really shouldn't: the expansion is
1390 shorter and likely faster than the libcall. The expected code is something
1391 like the following:
1392
1393 movl 12(%ebp), %eax
1394 movl 16(%ebp), %ecx
1395 xorl %edx, %edx
1396 divl %ecx
1397 movl 8(%ebp), %eax
1398 divl %ecx
1399 movl %edx, %eax
1400 ret
1401
1402 A similar code sequence works for division.
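
For reference, a C sketch (hypothetical name) of why the two-divl sequence is
correct: the remainder depends on the high word only through hi % y, and the
second division's quotient fits in 32 bits because hi % y < y:

unsigned mod_expanded(unsigned long long x, unsigned y) {
  unsigned hi = (unsigned)(x >> 32), lo = (unsigned)x;
  unsigned r1 = hi % y;                                          /* first divl  */
  return (unsigned)((((unsigned long long)r1 << 32) | lo) % y);  /* second divl */
}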
1403
1404 //===---------------------------------------------------------------------===//
1405
1406 These should compile to the same code, but the latter codegens to useless
1407 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1408
1409 struct s1 { unsigned char a, b; };
1410 unsigned long f1(struct s1 x) {
1411 return x.a + x.b;
1412 }
1413 struct s2 { unsigned a: 8, b: 8; };
1414 unsigned long f2(struct s2 x) {
1415 return x.a + x.b;
1416 }
1417
1418 //===---------------------------------------------------------------------===//
1419
1420 We currently compile this:
1421
1422 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1423 entry:
1424 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1425 %sum = extractvalue {i32, i1} %t, 0
1426 %obit = extractvalue {i32, i1} %t, 1
1427 br i1 %obit, label %overflow, label %normal
1428 normal:
1429 ret i32 %sum
1430 overflow:
1431 call void @llvm.trap()
1432 unreachable
1433 }
1434 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1435 declare void @llvm.trap()
1436
1437 to:
1438
1439 _func1:
1440 movl 4(%esp), %eax
1441 addl 8(%esp), %eax
1442 jo LBB1_2 ## overflow
1443 LBB1_1: ## normal
1444 ret
1445 LBB1_2: ## overflow
1446 ud2
1447
1448 it would be nice to produce "into" someday.
1449
1450 //===---------------------------------------------------------------------===//
1451
1452 This code:
1453
1454 void vec_mpys1(int y[], const int x[], int scaler) {
1455 int i;
1456 for (i = 0; i < 150; i++)
1457 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1458 }
1459
1460 Compiles to this loop with GCC 3.x:
1461
1462 .L5:
1463 movl %ebx, %eax
1464 imull (%edi,%ecx,4)
1465 shrdl $31, %edx, %eax
1466 addl %eax, (%esi,%ecx,4)
1467 incl %ecx
1468 cmpl $149, %ecx
1469 jle .L5
1470
1471 llvm-gcc compiles it to the much uglier:
1472
1473 LBB1_1: ## bb1
1474 movl 24(%esp), %eax
1475 movl (%eax,%edi,4), %ebx
1476 movl %ebx, %ebp
1477 imull %esi, %ebp
1478 movl %ebx, %eax
1479 mull %ecx
1480 addl %ebp, %edx
1481 sarl $31, %ebx
1482 imull %ecx, %ebx
1483 addl %edx, %ebx
1484 shldl $1, %eax, %ebx
1485 movl 20(%esp), %eax
1486 addl %ebx, (%eax,%edi,4)
1487 incl %edi
1488 cmpl $150, %edi
1489 jne LBB1_1 ## bb1
1490
1491 The issue is that we hoist the cast of "scaler" to long long outside of the
1492 loop, so the value comes into the loop as two values, and
1493 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1494 constructed BUILD_PAIR which represents the cast value.
1495
1496 This can be handled by making CodeGenPrepare sink the cast.
1497
1498 //===---------------------------------------------------------------------===//
1499
1500 Test instructions can be eliminated by using EFLAGS values from arithmetic
1501 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1502 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1503 for read-modify-write instructions. It is also currently not done if the
1504 OF or CF flags are needed.
1505
1506 The shift operators have the complication that when the shift count is
1507 zero, EFLAGS is not set, so they can only subsume a test instruction if
1508 the shift count is known to be non-zero. Also, using the EFLAGS value
1509 from a shift is apparently very slow on some x86 implementations.
1510
1511 In read-modify-write instructions, the root node in the isel match is
1512 the store, and isel has no way for the use of the EFLAGS result of the
1513 arithmetic to be remapped to the new node.
1514
1515 Add and subtract instructions set OF on signed overflow and CF on unsigned
1516 overflow, while test instructions always clear OF and CF. In order to
1517 replace a test with an add or subtract in a situation where OF or CF is
1518 needed, codegen must be able to prove that the operation cannot see
1519 signed or unsigned overflow, respectively.
1520
1521 //===---------------------------------------------------------------------===//
1522
1523 memcpy/memmove do not lower to SSE copies when possible. A silly example is:
1524 define <16 x float> @foo(<16 x float> %A) nounwind {
1525 %tmp = alloca <16 x float>, align 16
1526 %tmp2 = alloca <16 x float>, align 16
1527 store <16 x float> %A, <16 x float>* %tmp
1528 %s = bitcast <16 x float>* %tmp to i8*
1529 %s2 = bitcast <16 x float>* %tmp2 to i8*
1530 call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1531 %R = load <16 x float>* %tmp2
1532 ret <16 x float> %R
1533 }
1534
1535 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1536
1537 which compiles to:
1538
1539 _foo:
1540 subl $140, %esp
1541 movaps %xmm3, 112(%esp)
1542 movaps %xmm2, 96(%esp)
1543 movaps %xmm1, 80(%esp)
1544 movaps %xmm0, 64(%esp)
1545 movl 60(%esp), %eax
1546 movl %eax, 124(%esp)
1547 movl 56(%esp), %eax
1548 movl %eax, 120(%esp)
1549 movl 52(%esp), %eax
1550 <many many more 32-bit copies>
1551 movaps (%esp), %xmm0
1552 movaps 16(%esp), %xmm1
1553 movaps 32(%esp), %xmm2
1554 movaps 48(%esp), %xmm3
1555 addl $140, %esp
1556 ret
1557
1558 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1559 fall back to lower-granularity chunks.
1560
1561 //===---------------------------------------------------------------------===//
1562
1563 Implement processor-specific optimizations for parity with GCC on these
1564 processors. GCC does two optimizations:
1565
1566 1. ix86_pad_returns inserts a noop before ret instructions if they are
1567    immediately preceded by a conditional branch or are the target of a jump.
1568 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1569 code contains more than 3 branches.
1570
1571 The first one is done for all AMDs, Core 2, and "Generic".
1572 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1573 Core 2, and "Generic".
1574
1575 //===---------------------------------------------------------------------===//
1576
1577 Testcase:
1578 int a(int x) { return (x & 127) > 31; }
1579
1580 Current output:
1581 movl 4(%esp), %eax
1582 andl $127, %eax
1583 cmpl $31, %eax
1584 seta %al
1585 movzbl %al, %eax
1586 ret
1587
1588 Ideal output:
1589 xorl %eax, %eax
1590 testl $96, 4(%esp)
1591 setne %al
1592 ret
1593
1594 This should definitely be done in instcombine, canonicalizing the range
1595 condition into a != condition. We get this IR:
1596
1597 define i32 @a(i32 %x) nounwind readnone {
1598 entry:
1599 %0 = and i32 %x, 127 ; <i32> [#uses=1]
1600 %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
1601 %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
1602 ret i32 %2
1603 }
1604
1605 Instcombine prefers to strength-reduce relational comparisons to equality
1606 comparisons when possible; this should be another case of that. This could
1607 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1608 looks like InstCombiner::visitICmpInstWithInstAndIntCst really should be
1609 redesigned to use ComputeMaskedBits and friends.
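
The canonical form being suggested, written out as source (hypothetical name):
within the low seven bits, only bits 5 and 6 (0x60 == 96) can push the masked
value above 31:

int a_canon(int x) {
  return (x & 96) != 0;   /* same as (x & 127) > 31 */
}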
1610
1611
1612 //===---------------------------------------------------------------------===//
1613 Testcase:
1614 int x(int a) { return (a&0xf0)>>4; }
1615
1616 Current output:
1617 movl 4(%esp), %eax
1618 shrl $4, %eax
1619 andl $15, %eax
1620 ret
1621
1622 Ideal output:
1623 movzbl 4(%esp), %eax
1624 shrl $4, %eax
1625 ret
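
The source form that matches the ideal sequence (hypothetical name): zero-extend
the low byte first, which makes the AND implicit in the narrow load:

int x_narrow(int a) {
  return (unsigned char)a >> 4;   /* movzbl ; shrl $4 */
}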
1626
1627 //===---------------------------------------------------------------------===//
1628
1629 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1630 properly.
1631
1632 When the return value is not used (i.e. we only care about the value in
1633 memory), x86 does not have to produce the fetched value. Instead, it can use
1634 the add, sub, inc, and dec instructions with the "lock" prefix.
1635
1636 This is currently implemented with a bit of an instruction selection trick. The
1637 issue is that the target-independent pattern produces one output and a chain,
1638 and we want to map it into one that just outputs a chain. The current trick is
1639 to select it into a MERGE_VALUES with the first definition being an
1640 implicit_def. The proper solution is to add new ISD opcodes for the no-output
1641 variant. The DAG combiner can then transform the node before it gets to target
     node selection.
1642
1643 Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
1644 in fact these instructions are identical to the non-lock versions. We need a
1645 way to add target-specific information to target nodes and have this
1646 information carried over to machine instructions. The asm printer (or JIT) can
1647 use this information to add the "lock" prefix.
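
A minimal illustration of the two shapes being distinguished (hypothetical
names): only the second one consumes the result, so only it needs the
value-producing form; the first can become a plain "lock addl" on memory once
the no-output variant exists:

void add_noresult(int *p, int v) { __sync_fetch_and_add(p, v); }       /* value unused */
int  add_result(int *p, int v)   { return __sync_add_and_fetch(p, v); } /* value needed */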
1648
1649 //===---------------------------------------------------------------------===//
1650
1651 struct B {
1652 unsigned char y0 : 1;
1653 };
1654
1655 int bar(struct B* a) { return a->y0; }
1656
1657 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1658 %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1659 %2 = load i8* %1, align 1
1660 %3 = and i8 %2, 1
1661 %4 = zext i8 %3 to i32
1662 ret i32 %4
1663 }
1664
1665 bar: # @bar
1666 # BB#0:
1667 movb (%rdi), %al
1668 andb $1, %al
1669 movzbl %al, %eax
1670 ret
1671
1672 Missed optimization: should be movl+andl.
1673
1674 //===---------------------------------------------------------------------===//
1675
1676 The x86_64 ABI says:
1677
1678 Booleans, when stored in a memory object, are stored as single byte objects the
1679 value of which is always 0 (false) or 1 (true).
1680
1681 We are not using this fact:
1682
1683 int bar(_Bool *a) { return *a; }
1684
1685 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1686 %1 = load i8* %a, align 1, !tbaa !0
1687 %tmp = and i8 %1, 1
1688 %2 = zext i8 %tmp to i32
1689 ret i32 %2
1690 }
1691
1692 bar:
1693 movb (%rdi), %al
1694 andb $1, %al
1695 movzbl %al, %eax
1696 ret
1697
1698 GCC produces
1699
1700 bar:
1701 movzbl (%rdi), %eax
1702 ret
1703
1704 //===---------------------------------------------------------------------===//
1705
1706 Consider the following two functions compiled with clang:
1707 _Bool foo(int *x) { return !(*x & 4); }
1708 unsigned bar(int *x) { return !(*x & 4); }
1709
1710 foo:
1711 movl 4(%esp), %eax
1712 testb $4, (%eax)
1713 sete %al
1714 movzbl %al, %eax
1715 ret
1716
1717 bar:
1718 movl 4(%esp), %eax
1719 movl (%eax), %eax
1720 shrl $2, %eax
1721 andl $1, %eax
1722 xorl $1, %eax
1723 ret
1724
1725 The second function generates more code even though the two functions
1726 are functionally identical.
1727
1728 //===---------------------------------------------------------------------===//
1729
1730 Take the following C code:
1731 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1732
1733 We generate the following IR with clang:
1734 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1735 entry:
1736 %tmp = xor i32 %b, %a ; <i32> [#uses=1]
1737 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
1738 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
1739 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1740 ret i32 %conv5
1741 }
1742
1743 And the following x86 code:
1744 xorl %esi, %edi
1745 testb $-1, %dil
1746 sete %al
1747 movzbl %al, %eax
1748 ret
1749
1750 A cmpb instead of the xorl+testb would be one instruction shorter.
1751
1752 //===---------------------------------------------------------------------===//
1753
1754 Given the following C code:
1755 int f(int a, int b) { return (signed char)a == (signed char)b; }
1756
1757 We generate the following IR with clang:
1758 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1759 entry:
1760 %sext = shl i32 %a, 24 ; <i32> [#uses=1]
1761 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
1762 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
1763 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
1764 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
1765 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1766 ret i32 %conv5
1767 }
1768
1769 And the following x86 code:
1770 movsbl %sil, %eax
1771 movsbl %dil, %ecx
1772 cmpl %eax, %ecx
1773 sete %al
1774 movzbl %al, %eax
1775 ret
1776
1777
1778 It should be possible to eliminate the sign extensions.
1779
1780 //===---------------------------------------------------------------------===//
1781
1782 LLVM misses a load+store narrowing opportunity in this code:
1783
1784 %struct.bf = type { i64, i16, i16, i32 }
1785
1786 @bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
1787
1788 define void @t1() nounwind ssp {
1789 entry:
1790 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1791 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1792 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
1793 %3 = load i32* %2, align 1 ; <i32> [#uses=1]
1794 %4 = and i32 %3, -65537 ; <i32> [#uses=1]
1795 store i32 %4, i32* %2, align 1
1796 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1797 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1798 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
1799 %8 = load i32* %7, align 1 ; <i32> [#uses=1]
1800 %9 = and i32 %8, -131073 ; <i32> [#uses=1]
1801 store i32 %9, i32* %7, align 1
1802 ret void
1803 }
1804
1805 LLVM currently emits this:
1806
1807 movq bfi(%rip), %rax
1808 andl $-65537, 8(%rax)
1809 movq bfi(%rip), %rax
1810 andl $-131073, 8(%rax)
1811 ret
1812
1813 It could narrow the loads and stores to emit this:
1814
1815 movq bfi(%rip), %rax
1816 andb $-2, 10(%rax)
1817 movq bfi(%rip), %rax
1818 andb $-3, 10(%rax)
1819 ret
1820
1821 The trouble is that there is a TokenFactor between the store and the
1822 load, making it non-trivial to determine if there's anything between
1823 the load and the store which would prohibit narrowing.
1824
1825 //===---------------------------------------------------------------------===//
1826
1827 This code:
1828 void foo(unsigned x) {
1829 if (x == 0) bar();
1830 else if (x == 1) qux();
1831 }
1832
1833 currently compiles into:
1834 _foo:
1835 movl 4(%esp), %eax
1836 cmpl $1, %eax
1837 je LBB0_3
1838 testl %eax, %eax
1839 jne LBB0_4
1840
1841 the testl could be removed:
1842 _foo:
1843 movl 4(%esp), %eax
1844 cmpl $1, %eax
1845 je LBB0_3
1846 jb LBB0_4
1847
1848 0 is the only unsigned number < 1.
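
In source form, the control flow the better code corresponds to (hypothetical
variant): once x != 1 is known, the unsigned compare x < 1 is exactly x == 0:

void bar(void); void qux(void);
void foo2(unsigned x) {
  if (x == 1) qux();        /* cmpl $1 ; je */
  else if (x < 1) bar();    /* jb           */
}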
1849
1850 //===---------------------------------------------------------------------===//
1851
1852 This code:
1853
1854 %0 = type { i32, i1 }
1855
1856 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1857 entry:
1858 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1859 %cmp = extractvalue %0 %uadd, 1
1860 %inc = zext i1 %cmp to i32
1861 %add = add i32 %x, %sum
1862 %z.0 = add i32 %add, %inc
1863 ret i32 %z.0
1864 }
1865
1866 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1867
1868 compiles to:
1869
1870 _add32carry: ## @add32carry
1871 addl %esi, %edi
1872 sbbl %ecx, %ecx
1873 movl %edi, %eax
1874 subl %ecx, %eax
1875 ret
1876
1877 But it could be:
1878
1879 _add32carry:
1880 leal (%rsi,%rdi), %eax
1881 cmpl %esi, %eax
1882 adcl $0, %eax
1883 ret
1884
1885 //===---------------------------------------------------------------------===//
1886
1887 The hot loop of 256.bzip2 contains code that looks a bit like this:
1888
1889 int foo(char *P, char *Q, int x, int y) {
1890 if (P[0] != Q[0])
1891 return P[0] < Q[0];
1892 if (P[1] != Q[1])
1893 return P[1] < Q[1];
1894 if (P[2] != Q[2])
1895 return P[2] < Q[2];
1896 return P[3] < Q[3];
1897 }
1898
1899 In the real code, we get a lot more wrong than this. However, even in this
1900 code we generate:
1901
1902 _foo: ## @foo
1903 ## BB#0: ## %entry
1904 movb (%rsi), %al
1905 movb (%rdi), %cl
1906 cmpb %al, %cl
1907 je LBB0_2
1908 LBB0_1: ## %if.then
1909 cmpb %al, %cl
1910 jmp LBB0_5
1911 LBB0_2: ## %if.end
1912 movb 1(%rsi), %al
1913 movb 1(%rdi), %cl
1914 cmpb %al, %cl
1915 jne LBB0_1
1916 ## BB#3: ## %if.end38
1917 movb 2(%rsi), %al
1918 movb 2(%rdi), %cl
1919 cmpb %al, %cl
1920 jne LBB0_1
1921 ## BB#4: ## %if.end60
1922 movb 3(%rdi), %al
1923 cmpb 3(%rsi), %al
1924 LBB0_5: ## %if.end60
1925 setl %al
1926 movzbl %al, %eax
1927 ret
1928
1929 Note that we generate jumps to LBB0_1 which does a redundant compare. The
1930 redundant compare also forces the register values to be live, which prevents
1931 folding one of the loads into the compare. In contrast, GCC 4.2 produces:
1932
1933 _foo:
1934 movzbl (%rsi), %eax
1935 cmpb %al, (%rdi)
1936 jne L10
1937 L12:
1938 movzbl 1(%rsi), %eax
1939 cmpb %al, 1(%rdi)
1940 jne L10
1941 movzbl 2(%rsi), %eax
1942 cmpb %al, 2(%rdi)
1943 jne L10
1944 movzbl 3(%rdi), %eax
1945 cmpb 3(%rsi), %al
1946 L10:
1947 setl %al
1948 movzbl %al, %eax
1949 ret
1950
1951 which is "perfect".
1952
1953 //===---------------------------------------------------------------------===//
1954
1955 For the branch in the following code:
1956 int a();
1957 int b(int x, int y) {
1958 if (x & (1<<(y&7)))
1959 return a();
1960 return y;
1961 }
1962
1963 We currently generate:
1964 movb %sil, %al
1965 andb $7, %al
1966 movzbl %al, %eax
1967 btl %eax, %edi
1968 jae .LBB0_2
1969
1970 movl+andl would be shorter than the movb+andb+movzbl sequence.
1971
1972 //===---------------------------------------------------------------------===//
1973
1974 For the following:
1975 struct u1 {
1976 float x, y;
1977 };
1978 float foo(struct u1 u) {
1979 return u.x + u.y;
1980 }
1981
1982 We currently generate:
1983 movdqa %xmm0, %xmm1
1984 pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
1985 addss %xmm1, %xmm0
1986 ret
1987
1988 We could save an instruction here by commuting the addss.
1989
1990 //===---------------------------------------------------------------------===//
1991
1992 This (from PR9661):
1993
1994 float clamp_float(float a) {
1995 if (a > 1.0f)
1996 return 1.0f;
1997 else if (a < 0.0f)
1998 return 0.0f;
1999 else
2000 return a;
2001 }
2002
2003 Could compile to:
2004
2005 clamp_float: # @clamp_float
2006 movss .LCPI0_0(%rip), %xmm1
2007 minss %xmm1, %xmm0
2008 pxor %xmm1, %xmm1
2009 maxss %xmm1, %xmm0
2010 ret
2011
2012 with -ffast-math.
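
A reformulation that maps directly onto that sequence (hypothetical name; NaN
handling differs from the branchy original, which is why -ffast-math is needed):

#include <math.h>
float clamp_float2(float a) {
  return fmaxf(0.0f, fminf(a, 1.0f));   /* minss ; maxss */
}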
2013
2014 //===---------------------------------------------------------------------===//
2015
2016 This function (from PR9803):
2017
2018 int clamp2(int a) {
2019 if (a > 5)
2020 a = 5;
2021 if (a < 0)
2022 return 0;
2023 return a;
2024 }
2025
2026 Compiles to:
2027
2028 _clamp2: ## @clamp2
2029 pushq %rbp
2030 movq %rsp, %rbp
2031 cmpl $5, %edi
2032 movl $5, %ecx
2033 cmovlel %edi, %ecx
2034 testl %ecx, %ecx
2035 movl $0, %eax
2036 cmovnsl %ecx, %eax
2037 popq %rbp
2038 ret
2039
2040 The move of 0 could be scheduled above the test so that it becomes an xor reg,reg.
2041
2042 //===---------------------------------------------------------------------===//
2043
2044 GCC PR48986. We currently compile this:
2045
2046 void bar(void);
2047 void yyy(int* p) {
2048 if (__sync_fetch_and_add(p, -1) == 1)
2049 bar();
2050 }
2051
2052 into:
2053 movl $-1, %eax
2054 lock
2055 xaddl %eax, (%rdi)
2056 cmpl $1, %eax
2057 je LBB0_2
2058
2059 Instead we could generate:
2060
2061 lock
2062 	decl	(%rdi)
2063 je LBB0_2
2064
2065 The trick is to match "fetch_and_add(X, -C) == C".
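
The general shape of the pattern, sketched with a hypothetical C != 1 (a
"lock subl $C / je" would do; for C == 1 it degenerates to the lock decl above):

void bar(void);
void yyy4(int *p) {
  if (__sync_fetch_and_add(p, -4) == 4)   /* counter just reached zero */
    bar();
}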
2066
2067 //===---------------------------------------------------------------------===//
2068
2069