1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
4
5 //===---------------------------------------------------------------------===//
6
7 SSE Variable shift can be custom lowered to something like this, which uses a
8 small table + unaligned load + shuffle instead of going through memory.
9
10 __m128i_shift_right:
11 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
12 .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14 ...
15 __m128i shift_right(__m128i value, unsigned long offset) {
16 return _mm_shuffle_epi8(value,
17 _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
18 }
19
20 //===---------------------------------------------------------------------===//
21
22 SSE has instructions for doing operations on complex numbers, we should pattern
23 match them. For example, this should turn into a horizontal add:
24
25 typedef float __attribute__((vector_size(16))) v4f32;
26 float f32(v4f32 A) {
27 return A[0]+A[1]+A[2]+A[3];
28 }
29
30 Instead we get this:
31
32 _f32: ## @f32
33 pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
34 addss %xmm0, %xmm1
35 pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
36 movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
37 movaps %xmm0, %xmm3
38 addss %xmm1, %xmm3
39 movdqa %xmm2, %xmm0
40 addss %xmm3, %xmm0
41 ret
42
43 Also, there are cases where some simple local SLP would improve codegen a bit.
44 compiling this:
45
46 _Complex float f32(_Complex float A, _Complex float B) {
47 return A+B;
48 }
49
50 into:
51
52 _f32: ## @f32
53 movdqa %xmm0, %xmm2
54 addss %xmm1, %xmm2
55 pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
56 pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
57 addss %xmm1, %xmm3
58 movaps %xmm2, %xmm0
59 unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60 ret
61
62 seems silly when it could just be one addps.
63
64
65 //===---------------------------------------------------------------------===//
66
67 Expand libm rounding functions inline: Significant speedups possible.
68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70 //===---------------------------------------------------------------------===//
71
72 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
73 other fast SSE modes.
74
75 //===---------------------------------------------------------------------===//
76
77 Think about doing i64 math in SSE regs on x86-32.
78
79 //===---------------------------------------------------------------------===//
80
81 This testcase should have no SSE instructions in it, and only one load from
82 a constant pool:
83
84 double %test3(bool %B) {
85 %C = select bool %B, double 123.412, double 523.01123123
86 ret double %C
87 }
88
89 Currently, the select is being lowered, which prevents the dag combiner from
90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91
92 The pattern isel got this one right.
93
94 //===---------------------------------------------------------------------===//
95
96 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
97 feasible.
98
99 //===---------------------------------------------------------------------===//
100
101 Codegen:
102 if (copysign(1.0, x) == copysign(1.0, y))
103 into:
104 if (x^y & mask)
105 when using SSE.
106
107 //===---------------------------------------------------------------------===//
108
109 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
110 of a v4sf value.
111
112 //===---------------------------------------------------------------------===//
113
114 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
115 Perhaps use pxor / xorp* to clear a XMM register first?
116
117 //===---------------------------------------------------------------------===//
118
119 External test Nurbs exposed some problems. Look for
120 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
121 emits:
122
123 movaps (%edx), %xmm2 #59.21
124 movaps (%edx), %xmm5 #60.21
125 movaps (%edx), %xmm4 #61.21
126 movaps (%edx), %xmm3 #62.21
127 movl 40(%ecx), %ebp #69.49
128 shufps $0, %xmm2, %xmm5 #60.21
129 movl 100(%esp), %ebx #69.20
130 movl (%ebx), %edi #69.20
131 imull %ebp, %edi #69.49
132 addl (%eax), %edi #70.33
133 shufps $85, %xmm2, %xmm4 #61.21
134 shufps $170, %xmm2, %xmm3 #62.21
135 shufps $255, %xmm2, %xmm2 #63.21
136 lea (%ebp,%ebp,2), %ebx #69.49
137 negl %ebx #69.49
138 lea -3(%edi,%ebx), %ebx #70.33
139 shll $4, %ebx #68.37
140 addl 32(%ecx), %ebx #68.37
141 testb $15, %bl #91.13
142 jne L_B1.24 # Prob 5% #91.13
143
144 This is the llvm code after instruction scheduling:
145
146 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
147 %reg1078 = MOV32ri -3
148 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
149 %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
150 %reg1080 = IMUL32rr %reg1079, %reg1037
151 %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
152 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
153 %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
154 %reg1082 = SHL32ri %reg1038, 4
155 %reg1039 = ADD32rr %reg1036, %reg1082
156 %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
157 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
158 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
159 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
160 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
161 %reg1040 = MOV32rr %reg1039
162 %reg1084 = AND32ri8 %reg1039, 15
163 CMP32ri8 %reg1084, 0
164 JE mbb<cond_next204,0xa914d30>
165
166 Still ok. After register allocation:
167
168 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
169 %eax = MOV32ri -3
170 %edx = MOV32rm %stack.3, 1, %noreg, 0
171 ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
172 %edx = MOV32rm %stack.7, 1, %noreg, 0
173 %edx = MOV32rm %edx, 1, %noreg, 40
174 IMUL32rr %eax<def&use>, %edx
175 %esi = MOV32rm %stack.5, 1, %noreg, 0
176 %esi = MOV32rm %esi, 1, %noreg, 0
177 MOV32mr %stack.4, 1, %noreg, 0, %esi
178 %eax = LEA32r %esi, 1, %eax, -3
179 %esi = MOV32rm %stack.7, 1, %noreg, 0
180 %esi = MOV32rm %esi, 1, %noreg, 32
181 %edi = MOV32rr %eax
182 SHL32ri %edi<def&use>, 4
183 ADD32rr %edi<def&use>, %esi
184 %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
185 %xmm1 = MOVAPSrr %xmm0
186 SHUFPSrr %xmm1<def&use>, %xmm1, 170
187 %xmm2 = MOVAPSrr %xmm0
188 SHUFPSrr %xmm2<def&use>, %xmm2, 0
189 %xmm3 = MOVAPSrr %xmm0
190 SHUFPSrr %xmm3<def&use>, %xmm3, 255
191 SHUFPSrr %xmm0<def&use>, %xmm0, 85
192 %ebx = MOV32rr %edi
193 AND32ri8 %ebx<def&use>, 15
194 CMP32ri8 %ebx, 0
195 JE mbb<cond_next204,0xa914d30>
196
197 This looks really bad. The problem is shufps is a destructive opcode. Since it
198 appears as operand two in more than one shufps ops. It resulted in a number of
199 copies. Note icc also suffers from the same problem. Either the instruction
200 selector should select pshufd or The register allocator can made the two-address
201 to three-address transformation.
202
203 It also exposes some other problems. See MOV32ri -3 and the spills.
204
205 //===---------------------------------------------------------------------===//
206
207 Consider:
208
209 __m128 test(float a) {
210 return _mm_set_ps(0.0, 0.0, 0.0, a*a);
211 }
212
213 This compiles into:
214
215 movss 4(%esp), %xmm1
216 mulss %xmm1, %xmm1
217 xorps %xmm0, %xmm0
218 movss %xmm1, %xmm0
219 ret
220
221 Because mulss doesn't modify the top 3 elements, the top elements of
222 xmm1 are already zero'd. We could compile this to:
223
224 movss 4(%esp), %xmm0
225 mulss %xmm0, %xmm0
226 ret
227
228 //===---------------------------------------------------------------------===//
229
230 Here's a sick and twisted idea. Consider code like this:
231
232 __m128 test(__m128 a) {
233 float b = *(float*)&A;
234 ...
235 return _mm_set_ps(0.0, 0.0, 0.0, b);
236 }
237
238 This might compile to this code:
239
240 movaps c(%esp), %xmm1
241 xorps %xmm0, %xmm0
242 movss %xmm1, %xmm0
243 ret
244
245 Now consider if the ... code caused xmm1 to get spilled. This might produce
246 this code:
247
248 movaps c(%esp), %xmm1
249 movaps %xmm1, c2(%esp)
250 ...
251
252 xorps %xmm0, %xmm0
253 movaps c2(%esp), %xmm1
254 movss %xmm1, %xmm0
255 ret
256
257 However, since the reload is only used by these instructions, we could
258 "fold" it into the uses, producing something like this:
259
260 movaps c(%esp), %xmm1
261 movaps %xmm1, c2(%esp)
262 ...
263
264 movss c2(%esp), %xmm0
265 ret
266
267 ... saving two instructions.
268
269 The basic idea is that a reload from a spill slot, can, if only one 4-byte
270 chunk is used, bring in 3 zeros the one element instead of 4 elements.
271 This can be used to simplify a variety of shuffle operations, where the
272 elements are fixed zeros.
273
274 //===---------------------------------------------------------------------===//
275
276 This code generates ugly code, probably due to costs being off or something:
277
278 define void @test(float* %P, <4 x float>* %P2 ) {
279 %xFloat0.688 = load float* %P
280 %tmp = load <4 x float>* %P2
281 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
282 store <4 x float> %inFloat3.713, <4 x float>* %P2
283 ret void
284 }
285
286 Generates:
287
288 _test:
289 movl 8(%esp), %eax
290 movaps (%eax), %xmm0
291 pxor %xmm1, %xmm1
292 movaps %xmm0, %xmm2
293 shufps $50, %xmm1, %xmm2
294 shufps $132, %xmm2, %xmm0
295 movaps %xmm0, (%eax)
296 ret
297
298 Would it be better to generate:
299
300 _test:
301 movl 8(%esp), %ecx
302 movaps (%ecx), %xmm0
303 xor %eax, %eax
304 pinsrw $6, %eax, %xmm0
305 pinsrw $7, %eax, %xmm0
306 movaps %xmm0, (%ecx)
307 ret
308
309 ?
310
311 //===---------------------------------------------------------------------===//
312
313 Some useful information in the Apple Altivec / SSE Migration Guide:
314
315 http://developer.apple.com/documentation/Performance/Conceptual/
316 Accelerate_sse_migration/index.html
317
318 e.g. SSE select using and, andnot, or. Various SSE compare translations.
319
320 //===---------------------------------------------------------------------===//
321
322 Add hooks to commute some CMPP operations.
323
324 //===---------------------------------------------------------------------===//
325
326 Apply the same transformation that merged four float into a single 128-bit load
327 to loads from constant pool.
328
329 //===---------------------------------------------------------------------===//
330
331 Floating point max / min are commutable when -enable-unsafe-fp-path is
332 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
333 nodes which are selected to max / min instructions that are marked commutable.
334
335 //===---------------------------------------------------------------------===//
336
337 We should materialize vector constants like "all ones" and "signbit" with
338 code like:
339
340 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
341
342 and:
343 cmpeqps xmm1, xmm1 ; xmm1 = all-ones
344 psrlq xmm1, 31 ; xmm1 = all 100000000000...
345
346 instead of using a load from the constant pool. The later is important for
347 ABS/NEG/copysign etc.
348
349 //===---------------------------------------------------------------------===//
350
351 These functions:
352
353 #include <xmmintrin.h>
354 __m128i a;
355 void x(unsigned short n) {
356 a = _mm_slli_epi32 (a, n);
357 }
358 void y(unsigned n) {
359 a = _mm_slli_epi32 (a, n);
360 }
361
362 compile to ( -O3 -static -fomit-frame-pointer):
363 _x:
364 movzwl 4(%esp), %eax
365 movd %eax, %xmm0
366 movaps _a, %xmm1
367 pslld %xmm0, %xmm1
368 movaps %xmm1, _a
369 ret
370 _y:
371 movd 4(%esp), %xmm0
372 movaps _a, %xmm1
373 pslld %xmm0, %xmm1
374 movaps %xmm1, _a
375 ret
376
377 "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
378 like movd would be sufficient in both cases as the value is already zero
379 extended in the 32-bit stack slot IIRC. For signed short, it should also be
380 save, as a really-signed value would be undefined for pslld.
381
382
383 //===---------------------------------------------------------------------===//
384
385 #include <math.h>
386 int t1(double d) { return signbit(d); }
387
388 This currently compiles to:
389 subl $12, %esp
390 movsd 16(%esp), %xmm0
391 movsd %xmm0, (%esp)
392 movl 4(%esp), %eax
393 shrl $31, %eax
394 addl $12, %esp
395 ret
396
397 We should use movmskp{s|d} instead.
398
399 //===---------------------------------------------------------------------===//
400
401 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
402 (aligned) vector load. This functionality has a couple of problems.
403
404 1. The code to infer alignment from loads of globals is in the X86 backend,
405 not the dag combiner. This is because dagcombine2 needs to be able to see
406 through the X86ISD::Wrapper node, which DAGCombine can't really do.
407 2. The code for turning 4 x load into a single vector load is target
408 independent and should be moved to the dag combiner.
409 3. The code for turning 4 x load into a vector load can only handle a direct
410 load from a global or a direct load from the stack. It should be generalized
411 to handle any load from P, P+4, P+8, P+12, where P can be anything.
412 4. The alignment inference code cannot handle loads from globals in non-static
413 mode because it doesn't look through the extra dyld stub load. If you try
414 vec_align.ll without -relocation-model=static, you'll see what I mean.
415
416 //===---------------------------------------------------------------------===//
417
418 We should lower store(fneg(load p), q) into an integer load+xor+store, which
419 eliminates a constant pool load. For example, consider:
420
421 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
422 entry:
423 %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
424 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
425 ret i64 %tmp20
426 }
427 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
428
429 This currently compiles to:
430
431 LCPI1_0: # <4 x float>
432 .long 2147483648 # float -0
433 .long 2147483648 # float -0
434 .long 2147483648 # float -0
435 .long 2147483648 # float -0
436 _ccosf:
437 subl $12, %esp
438 movss 16(%esp), %xmm0
439 movss %xmm0, 4(%esp)
440 movss 20(%esp), %xmm0
441 xorps LCPI1_0, %xmm0
442 movss %xmm0, (%esp)
443 call L_ccoshf$stub
444 addl $12, %esp
445 ret
446
447 Note the load into xmm0, then xor (to negate), then store. In PIC mode,
448 this code computes the pic base and does two loads to do the constant pool
449 load, so the improvement is much bigger.
450
451 The tricky part about this xform is that the argument load/store isn't exposed
452 until post-legalize, and at that point, the fneg has been custom expanded into
453 an X86 fxor. This means that we need to handle this case in the x86 backend
454 instead of in target independent code.
455
456 //===---------------------------------------------------------------------===//
457
458 Non-SSE4 insert into 16 x i8 is atrociously bad.
459
460 //===---------------------------------------------------------------------===//
461
462 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
463 is memory.
464
465 //===---------------------------------------------------------------------===//
466
467 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
468 any number of 0.0 simultaneously. Currently we only use it for simple
469 insertions.
470
471 See comments in LowerINSERT_VECTOR_ELT_SSE4.
472
473 //===---------------------------------------------------------------------===//
474
475 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
476 Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
477 legal, it'll just take a few extra patterns written in the .td file.
478
479 Note: this is not a code quality issue; the custom lowered code happens to be
480 right, but we shouldn't have to custom lower anything. This is probably related
481 to <2 x i64> ops being so bad.
482
483 //===---------------------------------------------------------------------===//
484
485 LLVM currently generates stack realignment code, when it is not necessary
486 needed. The problem is that we need to know about stack alignment too early,
487 before RA runs.
488
489 At that point we don't know, whether there will be vector spill, or not.
490 Stack realignment logic is overly conservative here, but otherwise we can
491 produce unaligned loads/stores.
492
493 Fixing this will require some huge RA changes.
494
495 Testcase:
496 #include <emmintrin.h>
497
498 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
499
500 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
501 - 22725, - 12873};;
502
503 vSInt16 madd(vSInt16 b)
504 {
505 return _mm_madd_epi16(a, b);
506 }
507
508 Generated code (x86-32, linux):
509 madd:
510 pushl %ebp
511 movl %esp, %ebp
512 andl $-16, %esp
513 movaps .LCPI1_0, %xmm1
514 pmaddwd %xmm1, %xmm0
515 movl %ebp, %esp
516 popl %ebp
517 ret
518
519 //===---------------------------------------------------------------------===//
520
521 Consider:
522 #include <emmintrin.h>
523 __m128 foo2 (float x) {
524 return _mm_set_ps (0, 0, x, 0);
525 }
526
527 In x86-32 mode, we generate this spiffy code:
528
529 _foo2:
530 movss 4(%esp), %xmm0
531 pshufd $81, %xmm0, %xmm0
532 ret
533
534 in x86-64 mode, we generate this code, which could be better:
535
536 _foo2:
537 xorps %xmm1, %xmm1
538 movss %xmm0, %xmm1
539 pshufd $81, %xmm1, %xmm0
540 ret
541
542 In sse4 mode, we could use insertps to make both better.
543
544 Here's another testcase that could use insertps [mem]:
545
546 #include <xmmintrin.h>
547 extern float x2, x3;
548 __m128 foo1 (float x1, float x4) {
549 return _mm_set_ps (x2, x1, x3, x4);
550 }
551
552 gcc mainline compiles it to:
553
554 foo1:
555 insertps $0x10, x2(%rip), %xmm0
556 insertps $0x10, x3(%rip), %xmm1
557 movaps %xmm1, %xmm2
558 movlhps %xmm0, %xmm2
559 movaps %xmm2, %xmm0
560 ret
561
562 //===---------------------------------------------------------------------===//
563
564 We compile vector multiply-by-constant into poor code:
565
566 define <4 x i32> @f(<4 x i32> %i) nounwind {
567 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
568 ret <4 x i32> %A
569 }
570
571 On targets without SSE4.1, this compiles into:
572
573 LCPI1_0: ## <4 x i32>
574 .long 10
575 .long 10
576 .long 10
577 .long 10
578 .text
579 .align 4,0x90
580 .globl _f
581 _f:
582 pshufd $3, %xmm0, %xmm1
583 movd %xmm1, %eax
584 imull LCPI1_0+12, %eax
585 movd %eax, %xmm1
586 pshufd $1, %xmm0, %xmm2
587 movd %xmm2, %eax
588 imull LCPI1_0+4, %eax
589 movd %eax, %xmm2
590 punpckldq %xmm1, %xmm2
591 movd %xmm0, %eax
592 imull LCPI1_0, %eax
593 movd %eax, %xmm1
594 movhlps %xmm0, %xmm0
595 movd %xmm0, %eax
596 imull LCPI1_0+8, %eax
597 movd %eax, %xmm0
598 punpckldq %xmm0, %xmm1
599 movaps %xmm1, %xmm0
600 punpckldq %xmm2, %xmm0
601 ret
602
603 It would be better to synthesize integer vector multiplication by constants
604 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
605 simple cases such as multiplication by powers of two would be better as
606 vector shifts than as multiplications.
607
608 //===---------------------------------------------------------------------===//
609
610 We compile this:
611
612 __m128i
613 foo2 (char x)
614 {
615 return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
616 }
617
618 into:
619 movl $1, %eax
620 xorps %xmm0, %xmm0
621 pinsrw $2, %eax, %xmm0
622 movzbl 4(%esp), %eax
623 pinsrw $3, %eax, %xmm0
624 movl $256, %eax
625 pinsrw $7, %eax, %xmm0
626 ret
627
628
629 gcc-4.2:
630 subl $12, %esp
631 movzbl 16(%esp), %eax
632 movdqa LC0, %xmm0
633 pinsrw $3, %eax, %xmm0
634 addl $12, %esp
635 ret
636 .const
637 .align 4
638 LC0:
639 .word 0
640 .word 0
641 .word 1
642 .word 0
643 .word 0
644 .word 0
645 .word 0
646 .word 256
647
648 With SSE4, it should be
649 movdqa .LC0(%rip), %xmm0
650 pinsrb $6, %edi, %xmm0
651
652 //===---------------------------------------------------------------------===//
653
654 We should transform a shuffle of two vectors of constants into a single vector
655 of constants. Also, insertelement of a constant into a vector of constants
656 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
657
658 We compiled it to something horrible:
659
660 .align 4
661 LCPI1_1: ## float
662 .long 1065353216 ## float 1
663 .const
664
665 .align 4
666 LCPI1_0: ## <4 x float>
667 .space 4
668 .long 1065353216 ## float 1
669 .space 4
670 .long 1065353216 ## float 1
671 .text
672 .align 4,0x90
673 .globl _t
674 _t:
675 xorps %xmm0, %xmm0
676 movhps LCPI1_0, %xmm0
677 movss LCPI1_1, %xmm1
678 movaps %xmm0, %xmm2
679 shufps $2, %xmm1, %xmm2
680 shufps $132, %xmm2, %xmm0
681 movaps %xmm0, 0
682
683 //===---------------------------------------------------------------------===//
684 rdar://5907648
685
686 This function:
687
688 float foo(unsigned char x) {
689 return x;
690 }
691
692 compiles to (x86-32):
693
694 define float @foo(i8 zeroext %x) nounwind {
695 %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
696 ret float %tmp12
697 }
698
699 compiles to:
700
701 _foo:
702 subl $4, %esp
703 movzbl 8(%esp), %eax
704 cvtsi2ss %eax, %xmm0
705 movss %xmm0, (%esp)
706 flds (%esp)
707 addl $4, %esp
708 ret
709
710 We should be able to use:
711 cvtsi2ss 8($esp), %xmm0
712 since we know the stack slot is already zext'd.
713
714 //===---------------------------------------------------------------------===//
715
716 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
717 when code size is critical. movlps is slower than movsd on core2 but it's one
718 byte shorter.
719
720 //===---------------------------------------------------------------------===//
721
722 We should use a dynamic programming based approach to tell when using FPStack
723 operations is cheaper than SSE. SciMark montecarlo contains code like this
724 for example:
725
726 double MonteCarlo_num_flops(int Num_samples) {
727 return ((double) Num_samples)* 4.0;
728 }
729
730 In fpstack mode, this compiles into:
731
732 LCPI1_0:
733 .long 1082130432 ## float 4.000000e+00
734 _MonteCarlo_num_flops:
735 subl $4, %esp
736 movl 8(%esp), %eax
737 movl %eax, (%esp)
738 fildl (%esp)
739 fmuls LCPI1_0
740 addl $4, %esp
741 ret
742
743 in SSE mode, it compiles into significantly slower code:
744
745 _MonteCarlo_num_flops:
746 subl $12, %esp
747 cvtsi2sd 16(%esp), %xmm0
748 mulsd LCPI1_0, %xmm0
749 movsd %xmm0, (%esp)
750 fldl (%esp)
751 addl $12, %esp
752 ret
753
754 There are also other cases in scimark where using fpstack is better, it is
755 cheaper to do fld1 than load from a constant pool for example, so
756 "load, add 1.0, store" is better done in the fp stack, etc.
757
758 //===---------------------------------------------------------------------===//
759
760 These should compile into the same code (PR6214): Perhaps instcombine should
761 canonicalize the former into the later?
762
763 define float @foo(float %x) nounwind {
764 %t = bitcast float %x to i32
765 %s = and i32 %t, 2147483647
766 %d = bitcast i32 %s to float
767 ret float %d
768 }
769
770 declare float @fabsf(float %n)
771 define float @bar(float %x) nounwind {
772 %d = call float @fabsf(float %x)
773 ret float %d
774 }
775
776 //===---------------------------------------------------------------------===//
777
778 This IR (from PR6194):
779
780 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
781 target triple = "x86_64-apple-darwin10.0.0"
782
783 %0 = type { double, double }
784 %struct.float3 = type { float, float, float }
785
786 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
787 entry:
788 %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
789 %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
790 %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
791 %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
792 %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
793 %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
794 %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
795 store float %tmp12, float* %tmp5
796 ret void
797 }
798
799 Compiles to:
800
801 _test: ## @test
802 movd %xmm0, %rax
803 shrq $32, %rax
804 movl %eax, 4(%rdi)
805 ret
806
807 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
808 doing a shuffle from v[1] to v[0] then a float store.
809
810 //===---------------------------------------------------------------------===//
811
812 [UNSAFE FP]
813
814 void foo(double, double, double);
815 void norm(double x, double y, double z) {
816 double scale = __builtin_sqrt(x*x + y*y + z*z);
817 foo(x/scale, y/scale, z/scale);
818 }
819
820 We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
821 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
822 and emit 3 mulsd in place of the divs. This can be done as a target-independent
823 transform.
824
825 If we're dealing with floats instead of doubles we could even replace the sqrtss
826 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
827 cost of reduced accuracy.
828
829 //===---------------------------------------------------------------------===//
830
831 This function should be matched to haddpd when the appropriate CPU is enabled:
832
833 #include <x86intrin.h>
834 double f (__m128d p) {
835 return p[0] + p[1];
836 }
837
838 similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
839 turn into hsubpd also.
840
841 //===---------------------------------------------------------------------===//
842
1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend.
3 //===---------------------------------------------------------------------===//
4
5 Improvements to the multiply -> shift/add algorithm:
6 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
7
8 //===---------------------------------------------------------------------===//
9
10 Improve code like this (occurs fairly frequently, e.g. in LLVM):
11 long long foo(int x) { return 1LL << x; }
12
13 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
14 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
15 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
16
17 Another useful one would be ~0ULL >> X and ~0ULL << X.
18
19 One better solution for 1LL << x is:
20 xorl %eax, %eax
21 xorl %edx, %edx
22 testb $32, %cl
23 sete %al
24 setne %dl
25 sall %cl, %eax
26 sall %cl, %edx
27
28 But that requires good 8-bit subreg support.
29
30 Also, this might be better. It's an extra shift, but it's one instruction
31 shorter, and doesn't stress 8-bit subreg support.
32 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
33 but without the unnecessary and.)
34 movl %ecx, %eax
35 shrl $5, %eax
36 movl %eax, %edx
37 xorl $1, %edx
38 sall %cl, %eax
39 sall %cl. %edx
40
41 64-bit shifts (in general) expand to really bad code. Instead of using
42 cmovs, we should expand to a conditional branch like GCC produces.
43
44 //===---------------------------------------------------------------------===//
45
46 Some isel ideas:
47
48 1. Dynamic programming based approach when compile time is not an
49 issue.
50 2. Code duplication (addressing mode) during isel.
51 3. Other ideas from "Register-Sensitive Selection, Duplication, and
52 Sequencing of Instructions".
53 4. Scheduling for reduced register pressure. E.g. "Minimum Register
54 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
55 and other related papers.
56 http://citeseer.ist.psu.edu/govindarajan01minimum.html
57
58 //===---------------------------------------------------------------------===//
59
60 Should we promote i16 to i32 to avoid partial register update stalls?
61
62 //===---------------------------------------------------------------------===//
63
64 Leave any_extend as pseudo instruction and hint to register
65 allocator. Delay codegen until post register allocation.
66 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
67 the coalescer how to deal with it though.
68
69 //===---------------------------------------------------------------------===//
70
71 It appears icc use push for parameter passing. Need to investigate.
72
73 //===---------------------------------------------------------------------===//
74
75 The instruction selector sometimes misses folding a load into a compare. The
76 pattern is written as (cmp reg, (load p)). Because the compare isn't
77 commutative, it is not matched with the load on both sides. The dag combiner
78 should be made smart enough to canonicalize the load into the RHS of a compare
79 when it can invert the result of the compare for free.
80
81 //===---------------------------------------------------------------------===//
82
83 In many cases, LLVM generates code like this:
84
85 _test:
86 movl 8(%esp), %eax
87 cmpl %eax, 4(%esp)
88 setl %al
89 movzbl %al, %eax
90 ret
91
92 on some processors (which ones?), it is more efficient to do this:
93
94 _test:
95 movl 8(%esp), %ebx
96 xor %eax, %eax
97 cmpl %ebx, 4(%esp)
98 setl %al
99 ret
100
101 Doing this correctly is tricky though, as the xor clobbers the flags.
102
103 //===---------------------------------------------------------------------===//
104
105 We should generate bts/btr/etc instructions on targets where they are cheap or
106 when codesize is important. e.g., for:
107
108 void setbit(int *target, int bit) {
109 *target |= (1 << bit);
110 }
111 void clearbit(int *target, int bit) {
112 *target &= ~(1 << bit);
113 }
114
115 //===---------------------------------------------------------------------===//
116
117 Instead of the following for memset char*, 1, 10:
118
119 movl $16843009, 4(%edx)
120 movl $16843009, (%edx)
121 movw $257, 8(%edx)
122
123 It might be better to generate
124
125 movl $16843009, %eax
126 movl %eax, 4(%edx)
127 movl %eax, (%edx)
128 movw al, 8(%edx)
129
130 when we can spare a register. It reduces code size.
131
132 //===---------------------------------------------------------------------===//
133
134 Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
135 get this:
136
137 define i32 @test1(i32 %X) {
138 %Y = sdiv i32 %X, 8
139 ret i32 %Y
140 }
141
142 _test1:
143 movl 4(%esp), %eax
144 movl %eax, %ecx
145 sarl $31, %ecx
146 shrl $29, %ecx
147 addl %ecx, %eax
148 sarl $3, %eax
149 ret
150
151 GCC knows several different ways to codegen it, one of which is this:
152
153 _test1:
154 movl 4(%esp), %eax
155 cmpl $-1, %eax
156 leal 7(%eax), %ecx
157 cmovle %ecx, %eax
158 sarl $3, %eax
159 ret
160
161 which is probably slower, but it's interesting at least :)
162
163 //===---------------------------------------------------------------------===//
164
165 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
166 We should leave these as libcalls for everything over a much lower threshold,
167 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
168 stores, TLB preheating, etc)
169
170 //===---------------------------------------------------------------------===//
171
172 Optimize this into something reasonable:
173 x * copysign(1.0, y) * copysign(1.0, z)
174
175 //===---------------------------------------------------------------------===//
176
177 Optimize copysign(x, *y) to use an integer load from y.
178
179 //===---------------------------------------------------------------------===//
180
181 The following tests perform worse with LSR:
182
183 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
184
185 //===---------------------------------------------------------------------===//
186
187 Adding to the list of cmp / test poor codegen issues:
188
189 int test(__m128 *A, __m128 *B) {
190 if (_mm_comige_ss(*A, *B))
191 return 3;
192 else
193 return 4;
194 }
195
196 _test:
197 movl 8(%esp), %eax
198 movaps (%eax), %xmm0
199 movl 4(%esp), %eax
200 movaps (%eax), %xmm1
201 comiss %xmm0, %xmm1
202 setae %al
203 movzbl %al, %ecx
204 movl $3, %eax
205 movl $4, %edx
206 cmpl $0, %ecx
207 cmove %edx, %eax
208 ret
209
210 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
211 are a number of issues. 1) We are introducing a setcc between the result of the
212 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
213 so a any extend (which becomes a zero extend) is added.
214
215 We probably need some kind of target DAG combine hook to fix this.
216
217 //===---------------------------------------------------------------------===//
218
219 We generate significantly worse code for this than GCC:
220 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
221 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
222
223 There is also one case we do worse on PPC.
224
225 //===---------------------------------------------------------------------===//
226
227 For this:
228
229 int test(int a)
230 {
231 return a * 3;
232 }
233
234 We currently emits
235 imull $3, 4(%esp), %eax
236
237 Perhaps this is what we really should generate is? Is imull three or four
238 cycles? Note: ICC generates this:
239 movl 4(%esp), %eax
240 leal (%eax,%eax,2), %eax
241
242 The current instruction priority is based on pattern complexity. The former is
243 more "complex" because it folds a load so the latter will not be emitted.
244
245 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
246 should always try to match LEA first since the LEA matching code does some
247 estimate to determine whether the match is profitable.
248
249 However, if we care more about code size, then imull is better. It's two bytes
250 shorter than movl + leal.
251
252 On a Pentium M, both variants have the same characteristics with regard
253 to throughput; however, the multiplication has a latency of four cycles, as
254 opposed to two cycles for the movl+lea variant.
255
256 //===---------------------------------------------------------------------===//
257
258 It appears gcc place string data with linkonce linkage in
259 .section __TEXT,__const_coal,coalesced instead of
260 .section __DATA,__const_coal,coalesced.
261 Take a look at darwin.h, there are other Darwin assembler directives that we
262 do not make use of.
263
264 //===---------------------------------------------------------------------===//
265
266 define i32 @foo(i32* %a, i32 %t) {
267 entry:
268 br label %cond_true
269
270 cond_true: ; preds = %cond_true, %entry
271 %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
272 %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
273 %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
274 %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
275 %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
276 %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
277 %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
278 %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
279 br i1 %tmp, label %bb12, label %cond_true
280
281 bb12: ; preds = %cond_true
282 ret i32 %tmp7
283 }
284 is pessimized by -loop-reduce and -indvars
285
286 //===---------------------------------------------------------------------===//
287
288 u32 to float conversion improvement:
289
290 float uint32_2_float( unsigned u ) {
291 float fl = (int) (u & 0xffff);
292 float fh = (int) (u >> 16);
293 fh *= 0x1.0p16f;
294 return fh + fl;
295 }
296
297 00000000 subl $0x04,%esp
298 00000003 movl 0x08(%esp,1),%eax
299 00000007 movl %eax,%ecx
300 00000009 shrl $0x10,%ecx
301 0000000c cvtsi2ss %ecx,%xmm0
302 00000010 andl $0x0000ffff,%eax
303 00000015 cvtsi2ss %eax,%xmm1
304 00000019 mulss 0x00000078,%xmm0
305 00000021 addss %xmm1,%xmm0
306 00000025 movss %xmm0,(%esp,1)
307 0000002a flds (%esp,1)
308 0000002d addl $0x04,%esp
309 00000030 ret
310
311 //===---------------------------------------------------------------------===//
312
313 When using fastcc abi, align stack slot of argument of type double on 8 byte
314 boundary to improve performance.
315
316 //===---------------------------------------------------------------------===//
317
318 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
319 simplifications for integer "x cmp y ? a : b".
320
321 //===---------------------------------------------------------------------===//
322
323 Consider the expansion of:
324
325 define i32 @test3(i32 %X) {
326 %tmp1 = urem i32 %X, 255
327 ret i32 %tmp1
328 }
329
330 Currently it compiles to:
331
332 ...
333 movl $2155905153, %ecx
334 movl 8(%esp), %esi
335 movl %esi, %eax
336 mull %ecx
337 ...
338
339 This could be "reassociated" into:
340
341 movl $2155905153, %eax
342 movl 8(%esp), %ecx
343 mull %ecx
344
345 to avoid the copy. In fact, the existing two-address stuff would do this
346 except that mul isn't a commutative 2-addr instruction. I guess this has
347 to be done at isel time based on the #uses to mul?
348
349 //===---------------------------------------------------------------------===//
350
351 Make sure the instruction which starts a loop does not cross a cacheline
352 boundary. This requires knowning the exact length of each machine instruction.
353 That is somewhat complicated, but doable. Example 256.bzip2:
354
355 In the new trace, the hot loop has an instruction which crosses a cacheline
356 boundary. In addition to potential cache misses, this can't help decoding as I
357 imagine there has to be some kind of complicated decoder reset and realignment
358 to grab the bytes from the next cacheline.
359
360 532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
361 942 942 0x3d03 movl %dh, (1809(%esp, %esi)
362 937 937 0x3d0a incl %esi
363 3 3 0x3d0b cmpb %bl, %dl
364 27 27 0x3d0d jnz 0x000062db <main+11707>
365
366 //===---------------------------------------------------------------------===//
367
368 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
369
370 //===---------------------------------------------------------------------===//
371
372 This could be a single 16-bit load.
373
374 int f(char *p) {
375 if ((p[0] == 1) & (p[1] == 2)) return 1;
376 return 0;
377 }
378
379 //===---------------------------------------------------------------------===//
380
381 We should inline lrintf and probably other libc functions.
382
383 //===---------------------------------------------------------------------===//
384
385 This code:
386
387 void test(int X) {
388 if (X) abort();
389 }
390
391 is currently compiled to:
392
393 _test:
394 subl $12, %esp
395 cmpl $0, 16(%esp)
396 jne LBB1_1
397 addl $12, %esp
398 ret
399 LBB1_1:
400 call L_abort$stub
401
402 It would be better to produce:
403
404 _test:
405 subl $12, %esp
406 cmpl $0, 16(%esp)
407 jne L_abort$stub
408 addl $12, %esp
409 ret
410
411 This can be applied to any no-return function call that takes no arguments etc.
412 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
413 something like this:
414
415 _test:
416 cmpl $0, 4(%esp)
417 jne LBB1_1
418 ret
419 LBB1_1:
420 subl $12, %esp
421 call L_abort$stub
422
423 Both are useful in different situations. Finally, it could be shrink-wrapped
424 and tail called, like this:
425
426 _test:
427 cmpl $0, 4(%esp)
428 jne LBB1_1
429 ret
430 LBB1_1:
431 pop %eax # realign stack.
432 call L_abort$stub
433
434 Though this probably isn't worth it.
435
436 //===---------------------------------------------------------------------===//
437
438 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
439 a neg instead of a sub instruction. Consider:
440
441 int test(char X) { return 7-X; }
442
443 we currently produce:
444 _test:
445 movl $7, %eax
446 movsbl 4(%esp), %ecx
447 subl %ecx, %eax
448 ret
449
450 We would use one fewer register if codegen'd as:
451
452 movsbl 4(%esp), %eax
453 neg %eax
454 add $7, %eax
455 ret
456
457 Note that this isn't beneficial if the load can be folded into the sub. In
458 this case, we want a sub:
459
460 int test(int X) { return 7-X; }
461 _test:
462 movl $7, %eax
463 subl 4(%esp), %eax
464 ret
465
466 //===---------------------------------------------------------------------===//
467
468 Leaf functions that require one 4-byte spill slot have a prolog like this:
469
470 _foo:
471 pushl %esi
472 subl $4, %esp
473 ...
474 and an epilog like this:
475 addl $4, %esp
476 popl %esi
477 ret
478
479 It would be smaller, and potentially faster, to push eax on entry and to
480 pop into a dummy register instead of using addl/subl of esp. Just don't pop
481 into any return registers :)
482
483 //===---------------------------------------------------------------------===//
484
485 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
486 branches. We generate really poor code for:
487
488 double testf(double a) {
489 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
490 }
491
492 For example, the entry BB is:
493
494 _testf:
495 subl $20, %esp
496 pxor %xmm0, %xmm0
497 movsd 24(%esp), %xmm1
498 ucomisd %xmm0, %xmm1
499 setnp %al
500 sete %cl
501 testb %cl, %al
502 jne LBB1_5 # UnifiedReturnBlock
503 LBB1_1: # cond_true
504
505
506 it would be better to replace the last four instructions with:
507
508 jp LBB1_1
509 je LBB1_5
510 LBB1_1:
511
512 We also codegen the inner ?: into a diamond:
513
514 cvtss2sd LCPI1_0(%rip), %xmm2
515 cvtss2sd LCPI1_1(%rip), %xmm3
516 ucomisd %xmm1, %xmm0
517 ja LBB1_3 # cond_true
518 LBB1_2: # cond_true
519 movapd %xmm3, %xmm2
520 LBB1_3: # cond_true
521 movapd %xmm2, %xmm0
522 ret
523
524 We should sink the load into xmm3 into the LBB1_2 block. This should
525 be pretty easy, and will nuke all the copies.
526
527 //===---------------------------------------------------------------------===//
528
529 This:
530 #include <algorithm>
531 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
532 { return std::make_pair(a + b, a + b < a); }
533 bool no_overflow(unsigned a, unsigned b)
534 { return !full_add(a, b).second; }
535
536 Should compile to:
537 addl %esi, %edi
538 setae %al
539 movzbl %al, %eax
540 ret
541
542 on x86-64, instead of the rather stupid-looking:
543 addl %esi, %edi
544 setb %al
545 xorb $1, %al
546 movzbl %al, %eax
547 ret
548
549
550 //===---------------------------------------------------------------------===//
551
552 The following code:
553
554 bb114.preheader: ; preds = %cond_next94
555 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
556 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
557 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
558 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
559 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
560 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
561 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
562 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
563 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
564 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
565 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
566 br label %bb114
567
568 produces:
569
570 LBB3_5: # bb114.preheader
571 movswl -68(%ebp), %eax
572 movl $32, %ecx
573 movl %ecx, -80(%ebp)
574 subl %eax, -80(%ebp)
575 movswl -52(%ebp), %eax
576 movl %ecx, -84(%ebp)
577 subl %eax, -84(%ebp)
578 movswl -70(%ebp), %eax
579 movl %ecx, -88(%ebp)
580 subl %eax, -88(%ebp)
581 movswl -50(%ebp), %eax
582 subl %eax, %ecx
583 movl %ecx, -76(%ebp)
584 movswl -42(%ebp), %eax
585 movl %eax, -92(%ebp)
586 movswl -66(%ebp), %eax
587 movl %eax, -96(%ebp)
588 movw $0, -98(%ebp)
589
590 This appears to be bad because the RA is not folding the store to the stack
591 slot into the movl. The above instructions could be:
592 movl $32, -80(%ebp)
593 ...
594 movl $32, -84(%ebp)
595 ...
596 This seems like a cross between remat and spill folding.
597
598 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
599 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
600 vice-versa).
601
602 //===---------------------------------------------------------------------===//
603
604 This code:
605
606 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
607 br i1 %tmp659, label %cond_true662, label %cond_next715
608
609 produces this:
610
611 testw %cx, %cx
612 movswl %cx, %esi
613 jns LBB4_109 # cond_next715
614
615 Shark tells us that using %cx in the testw instruction is sub-optimal. It
616 suggests using the 32-bit register (which is what ICC uses).
617
618 //===---------------------------------------------------------------------===//
619
620 We compile this:
621
622 void compare (long long foo) {
623 if (foo < 4294967297LL)
624 abort();
625 }
626
627 to:
628
629 compare:
630 subl $4, %esp
631 cmpl $0, 8(%esp)
632 setne %al
633 movzbw %al, %ax
634 cmpl $1, 12(%esp)
635 setg %cl
636 movzbw %cl, %cx
637 cmove %ax, %cx
638 testb $1, %cl
639 jne .LBB1_2 # UnifiedReturnBlock
640 .LBB1_1: # ifthen
641 call abort
642 .LBB1_2: # UnifiedReturnBlock
643 addl $4, %esp
644 ret
645
646 (also really horrible code on ppc). This is due to the expand code for 64-bit
647 compares. GCC produces multiple branches, which is much nicer:
648
649 compare:
650 subl $12, %esp
651 movl 20(%esp), %edx
652 movl 16(%esp), %eax
653 decl %edx
654 jle .L7
655 .L5:
656 addl $12, %esp
657 ret
658 .p2align 4,,7
659 .L7:
660 jl .L4
661 cmpl $0, %eax
662 .p2align 4,,8
663 ja .L5
664 .L4:
665 .p2align 4,,9
666 call abort
667
668 //===---------------------------------------------------------------------===//
669
670 Tail call optimization improvements: Tail call optimization currently
671 pushes all arguments on the top of the stack (their normal place for
672 non-tail call optimized calls) that source from the callers arguments
673 or that source from a virtual register (also possibly sourcing from
674 callers arguments).
675 This is done to prevent overwriting of parameters (see example
676 below) that might be used later.
677
678 example:
679
680 int callee(int32, int64);
681 int caller(int32 arg1, int32 arg2) {
682 int64 local = arg2 * 2;
683 return callee(arg2, (int64)local);
684 }
685
686 [arg1] [!arg2 no longer valid since we moved local onto it]
687 [arg2] -> [(int64)
688 [RETADDR] local ]
689
690 Moving arg1 onto the stack slot of callee function would overwrite
691 arg2 of the caller.
692
693 Possible optimizations:
694
695
696 - Analyse the actual parameters of the callee to see which would
697 overwrite a caller parameter which is used by the callee and only
698 push them onto the top of the stack.
699
700 int callee (int32 arg1, int32 arg2);
701 int caller (int32 arg1, int32 arg2) {
702 return callee(arg1,arg2);
703 }
704
705 Here we don't need to write any variables to the top of the stack
706 since they don't overwrite each other.
707
708 int callee (int32 arg1, int32 arg2);
709 int caller (int32 arg1, int32 arg2) {
710 return callee(arg2,arg1);
711 }
712
713 Here we need to push the arguments because they overwrite each
714 other.
715
716 //===---------------------------------------------------------------------===//
717
718 main ()
719 {
720 int i = 0;
721 unsigned long int z = 0;
722
723 do {
724 z -= 0x00004000;
725 i++;
726 if (i > 0x00040000)
727 abort ();
728 } while (z > 0);
729 exit (0);
730 }
731
732 gcc compiles this to:
733
734 _main:
735 subl $28, %esp
736 xorl %eax, %eax
737 jmp L2
738 L3:
739 cmpl $262144, %eax
740 je L10
741 L2:
742 addl $1, %eax
743 cmpl $262145, %eax
744 jne L3
745 call L_abort$stub
746 L10:
747 movl $0, (%esp)
748 call L_exit$stub
749
750 llvm:
751
752 _main:
753 subl $12, %esp
754 movl $1, %eax
755 movl $16384, %ecx
756 LBB1_1: # bb
757 cmpl $262145, %eax
758 jge LBB1_4 # cond_true
759 LBB1_2: # cond_next
760 incl %eax
761 addl $4294950912, %ecx
762 cmpl $16384, %ecx
763 jne LBB1_1 # bb
764 LBB1_3: # bb11
765 xorl %eax, %eax
766 addl $12, %esp
767 ret
768 LBB1_4: # cond_true
769 call L_abort$stub
770
771 1. LSR should rewrite the first cmp with induction variable %ecx.
772 2. DAG combiner should fold
773 leal 1(%eax), %edx
774 cmpl $262145, %edx
775 =>
776 cmpl $262144, %eax
777
778 //===---------------------------------------------------------------------===//
779
780 define i64 @test(double %X) {
781 %Y = fptosi double %X to i64
782 ret i64 %Y
783 }
784
785 compiles to:
786
787 _test:
788 subl $20, %esp
789 movsd 24(%esp), %xmm0
790 movsd %xmm0, 8(%esp)
791 fldl 8(%esp)
792 fisttpll (%esp)
793 movl 4(%esp), %edx
794 movl (%esp), %eax
795 addl $20, %esp
796 #FP_REG_KILL
797 ret
798
799 This should just fldl directly from the input stack slot.
800
801 //===---------------------------------------------------------------------===//
802
803 This code:
804 int foo (int x) { return (x & 65535) | 255; }
805
806 Should compile into:
807
808 _foo:
809 movzwl 4(%esp), %eax
810 orl $255, %eax
811 ret
812
813 instead of:
814 _foo:
815 movl $65280, %eax
816 andl 4(%esp), %eax
817 orl $255, %eax
818 ret
819
820 //===---------------------------------------------------------------------===//
821
822 We're codegen'ing multiply of long longs inefficiently:
823
824 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
825 return arg1 * arg2;
826 }
827
828 We compile to (fomit-frame-pointer):
829
830 _LLM:
831 pushl %esi
832 movl 8(%esp), %ecx
833 movl 16(%esp), %esi
834 movl %esi, %eax
835 mull %ecx
836 imull 12(%esp), %esi
837 addl %edx, %esi
838 imull 20(%esp), %ecx
839 movl %esi, %edx
840 addl %ecx, %edx
841 popl %esi
842 ret
843
844 This looks like a scheduling deficiency and lack of remat of the load from
845 the argument area. ICC apparently produces:
846
847 movl 8(%esp), %ecx
848 imull 12(%esp), %ecx
849 movl 16(%esp), %eax
850 imull 4(%esp), %eax
851 addl %eax, %ecx
852 movl 4(%esp), %eax
853 mull 12(%esp)
854 addl %ecx, %edx
855 ret
856
857 Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
858 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
859
860 //===---------------------------------------------------------------------===//
861
862 We can fold a store into "zeroing a reg". Instead of:
863
864 xorl %eax, %eax
865 movl %eax, 124(%esp)
866
867 we should get:
868
869 movl $0, 124(%esp)
870
871 if the flags of the xor are dead.
872
873 Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
874 be folded into: shl [mem], 1
875
876 //===---------------------------------------------------------------------===//
877
878 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
879 or and instruction, for example:
880
881 xorpd LCPI1_0, %xmm2
882
883 However, if xmm2 gets spilled, we end up with really ugly code like this:
884
885 movsd (%esp), %xmm0
886 xorpd LCPI1_0, %xmm0
887 movsd %xmm0, (%esp)
888
889 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
890 the neg/abs instruction, turning it into an *integer* operation, like this:
891
892 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
893
894 you could also use xorb, but xorl is less likely to lead to a partial register
895 stall. Here is a contrived testcase:
896
897 double a, b, c;
898 void test(double *P) {
899 double X = *P;
900 a = X;
901 bar();
902 X = -X;
903 b = X;
904 bar();
905 c = X;
906 }
907
908 //===---------------------------------------------------------------------===//
909
910 The generated code on x86 for checking for signed overflow on a multiply the
911 obvious way is much longer than it needs to be.
912
913 int x(int a, int b) {
914 long long prod = (long long)a*b;
915 return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
916 }
917
918 See PR2053 for more details.
919
920 //===---------------------------------------------------------------------===//
921
922 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
923 more aggressively; it should cost the same as a move+shift on any modern
924 processor, but it's a lot shorter. Downside is that it puts more
925 pressure on register allocation because it has fixed operands.
926
927 Example:
928 int abs(int x) {return x < 0 ? -x : x;}
929
930 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
931 abs:
932 movl 4(%esp), %eax
933 cltd
934 xorl %edx, %eax
935 subl %edx, %eax
936 ret
937
938 //===---------------------------------------------------------------------===//
939
940 Take the following code (from
941 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
942
943 extern unsigned char first_one[65536];
944 int FirstOnet(unsigned long long arg1)
945 {
946 if (arg1 >> 48)
947 return (first_one[arg1 >> 48]);
948 return 0;
949 }
950
951
952 The following code is currently generated:
953 FirstOnet:
954 movl 8(%esp), %eax
955 cmpl $65536, %eax
956 movl 4(%esp), %ecx
957 jb .LBB1_2 # UnifiedReturnBlock
958 .LBB1_1: # ifthen
959 shrl $16, %eax
960 movzbl first_one(%eax), %eax
961 ret
962 .LBB1_2: # UnifiedReturnBlock
963 xorl %eax, %eax
964 ret
965
966 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
967 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
968
969 //===---------------------------------------------------------------------===//
970
971 We compile this function:
972
973 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
974 entry:
975 %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
976 br i1 %tmp2, label %bb7, label %bb
977
978 bb: ; preds = %entry
979 %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
980 ret i32 %tmp6
981
982 bb7: ; preds = %entry
983 %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
984 ret i32 %tmp10
985 }
986
987 to:
988
989 foo: # @foo
990 # %bb.0: # %entry
991 movl 4(%esp), %ecx
992 cmpb $0, 16(%esp)
993 je .LBB0_2
994 # %bb.1: # %bb
995 movl 8(%esp), %eax
996 addl %ecx, %eax
997 ret
998 .LBB0_2: # %bb7
999 movl 12(%esp), %edx
1000 movl %ecx, %eax
1001 subl %edx, %eax
1002 ret
1003
1004 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1005 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1006
1007 //===---------------------------------------------------------------------===//
1008
1009 See rdar://4653682.
1010
1011 From flops:
1012
1013 LBB1_15: # bb310
1014 cvtss2sd LCPI1_0, %xmm1
1015 addsd %xmm1, %xmm0
1016 movsd 176(%esp), %xmm2
1017 mulsd %xmm0, %xmm2
1018 movapd %xmm2, %xmm3
1019 mulsd %xmm3, %xmm3
1020 movapd %xmm3, %xmm4
1021 mulsd LCPI1_23, %xmm4
1022 addsd LCPI1_24, %xmm4
1023 mulsd %xmm3, %xmm4
1024 addsd LCPI1_25, %xmm4
1025 mulsd %xmm3, %xmm4
1026 addsd LCPI1_26, %xmm4
1027 mulsd %xmm3, %xmm4
1028 addsd LCPI1_27, %xmm4
1029 mulsd %xmm3, %xmm4
1030 addsd LCPI1_28, %xmm4
1031 mulsd %xmm3, %xmm4
1032 addsd %xmm1, %xmm4
1033 mulsd %xmm2, %xmm4
1034 movsd 152(%esp), %xmm1
1035 addsd %xmm4, %xmm1
1036 movsd %xmm1, 152(%esp)
1037 incl %eax
1038 cmpl %eax, %esi
1039 jge LBB1_15 # bb310
1040 LBB1_16: # bb358.loopexit
1041 movsd 152(%esp), %xmm0
1042 addsd %xmm0, %xmm0
1043 addsd LCPI1_22, %xmm0
1044 movsd %xmm0, 152(%esp)
1045
1046 Rather than spilling the result of the last addsd in the loop, we should have
1047 insert a copy to split the interval (one for the duration of the loop, one
1048 extending to the fall through). The register pressure in the loop isn't high
1049 enough to warrant the spill.
1050
1051 Also check why xmm7 is not used at all in the function.
1052
1053 //===---------------------------------------------------------------------===//
1054
1055 Take the following:
1056
1057 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1058 target triple = "i386-apple-darwin8"
1059 @in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
1060 define fastcc void @abort_gzip() noreturn nounwind {
1061 entry:
1062 %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
1063 br i1 %tmp.b.i, label %bb.i, label %bb4.i
1064 bb.i: ; preds = %entry
1065 tail call void @exit( i32 1 ) noreturn nounwind
1066 unreachable
1067 bb4.i: ; preds = %entry
1068 store i1 true, i1* @in_exit.4870.b
1069 tail call void @exit( i32 1 ) noreturn nounwind
1070 unreachable
1071 }
1072 declare void @exit(i32) noreturn nounwind
1073
1074 This compiles into:
1075 _abort_gzip: ## @abort_gzip
1076 ## %bb.0: ## %entry
1077 subl $12, %esp
1078 movb _in_exit.4870.b, %al
1079 cmpb $1, %al
1080 jne LBB0_2
1081
1082 We somehow miss folding the movb into the cmpb.
1083
1084 //===---------------------------------------------------------------------===//
1085
1086 We compile:
1087
1088 int test(int x, int y) {
1089 return x-y-1;
1090 }
1091
1092 into (-m64):
1093
1094 _test:
1095 decl %edi
1096 movl %edi, %eax
1097 subl %esi, %eax
1098 ret
1099
1100 it would be better to codegen as: x+~y (notl+addl)
1101
1102 //===---------------------------------------------------------------------===//
1103
1104 This code:
1105
1106 int foo(const char *str,...)
1107 {
1108 __builtin_va_list a; int x;
1109 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1110 return x;
1111 }
1112
1113 gets compiled into this on x86-64:
1114 subq $200, %rsp
1115 movaps %xmm7, 160(%rsp)
1116 movaps %xmm6, 144(%rsp)
1117 movaps %xmm5, 128(%rsp)
1118 movaps %xmm4, 112(%rsp)
1119 movaps %xmm3, 96(%rsp)
1120 movaps %xmm2, 80(%rsp)
1121 movaps %xmm1, 64(%rsp)
1122 movaps %xmm0, 48(%rsp)
1123 movq %r9, 40(%rsp)
1124 movq %r8, 32(%rsp)
1125 movq %rcx, 24(%rsp)
1126 movq %rdx, 16(%rsp)
1127 movq %rsi, 8(%rsp)
1128 leaq (%rsp), %rax
1129 movq %rax, 192(%rsp)
1130 leaq 208(%rsp), %rax
1131 movq %rax, 184(%rsp)
1132 movl $48, 180(%rsp)
1133 movl $8, 176(%rsp)
1134 movl 176(%rsp), %eax
1135 cmpl $47, %eax
1136 jbe .LBB1_3 # bb
1137 .LBB1_1: # bb3
1138 movq 184(%rsp), %rcx
1139 leaq 8(%rcx), %rax
1140 movq %rax, 184(%rsp)
1141 .LBB1_2: # bb4
1142 movl (%rcx), %eax
1143 addq $200, %rsp
1144 ret
1145 .LBB1_3: # bb
1146 movl %eax, %ecx
1147 addl $8, %eax
1148 addq 192(%rsp), %rcx
1149 movl %eax, 176(%rsp)
1150 jmp .LBB1_2 # bb4
1151
1152 gcc 4.3 generates:
1153 subq $96, %rsp
1154 .LCFI0:
1155 leaq 104(%rsp), %rax
1156 movq %rsi, -80(%rsp)
1157 movl $8, -120(%rsp)
1158 movq %rax, -112(%rsp)
1159 leaq -88(%rsp), %rax
1160 movq %rax, -104(%rsp)
1161 movl $8, %eax
1162 cmpl $48, %eax
1163 jb .L6
1164 movq -112(%rsp), %rdx
1165 movl (%rdx), %eax
1166 addq $96, %rsp
1167 ret
1168 .p2align 4,,10
1169 .p2align 3
1170 .L6:
1171 mov %eax, %edx
1172 addq -104(%rsp), %rdx
1173 addl $8, %eax
1174 movl %eax, -120(%rsp)
1175 movl (%rdx), %eax
1176 addq $96, %rsp
1177 ret
1178
1179 and it gets compiled into this on x86:
1180 pushl %ebp
1181 movl %esp, %ebp
1182 subl $4, %esp
1183 leal 12(%ebp), %eax
1184 movl %eax, -4(%ebp)
1185 leal 16(%ebp), %eax
1186 movl %eax, -4(%ebp)
1187 movl 12(%ebp), %eax
1188 addl $4, %esp
1189 popl %ebp
1190 ret
1191
1192 gcc 4.3 generates:
1193 pushl %ebp
1194 movl %esp, %ebp
1195 movl 12(%ebp), %eax
1196 popl %ebp
1197 ret
1198
1199 //===---------------------------------------------------------------------===//
1200
1201 Teach tblgen not to check bitconvert source type in some cases. This allows us
1202 to consolidate the following patterns in X86InstrMMX.td:
1203
1204 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1205 (iPTR 0))))),
1206 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1207 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1208 (iPTR 0))))),
1209 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1210 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1211 (iPTR 0))))),
1212 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1213
1214 There are other cases in various td files.
1215
1216 //===---------------------------------------------------------------------===//
1217
1218 Take something like the following on x86-32:
1219 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1220
1221 We currently generate a libcall, but we really shouldn't: the expansion is
1222 shorter and likely faster than the libcall. The expected code is something
1223 like the following:
1224
1225 movl 12(%ebp), %eax
1226 movl 16(%ebp), %ecx
1227 xorl %edx, %edx
1228 divl %ecx
1229 movl 8(%ebp), %eax
1230 divl %ecx
1231 movl %edx, %eax
1232 ret
1233
1234 A similar code sequence works for division.
1235
1236 //===---------------------------------------------------------------------===//
1237
1238 We currently compile this:
1239
1240 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1241 entry:
1242 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1243 %sum = extractvalue {i32, i1} %t, 0
1244 %obit = extractvalue {i32, i1} %t, 1
1245 br i1 %obit, label %overflow, label %normal
1246 normal:
1247 ret i32 %sum
1248 overflow:
1249 call void @llvm.trap()
1250 unreachable
1251 }
1252 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1253 declare void @llvm.trap()
1254
1255 to:
1256
1257 _func1:
1258 movl 4(%esp), %eax
1259 addl 8(%esp), %eax
1260 jo LBB1_2 ## overflow
1261 LBB1_1: ## normal
1262 ret
1263 LBB1_2: ## overflow
1264 ud2
1265
1266 it would be nice to produce "into" someday.
1267
1268 //===---------------------------------------------------------------------===//
1269
1270 Test instructions can be eliminated by using EFLAGS values from arithmetic
1271 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1272 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1273 for read-modify-write instructions. It is also current not done if the
1274 OF or CF flags are needed.
1275
1276 The shift operators have the complication that when the shift count is
1277 zero, EFLAGS is not set, so they can only subsume a test instruction if
1278 the shift count is known to be non-zero. Also, using the EFLAGS value
1279 from a shift is apparently very slow on some x86 implementations.
1280
1281 In read-modify-write instructions, the root node in the isel match is
1282 the store, and isel has no way for the use of the EFLAGS result of the
1283 arithmetic to be remapped to the new node.
1284
1285 Add and subtract instructions set OF on signed overflow and CF on unsiged
1286 overflow, while test instructions always clear OF and CF. In order to
1287 replace a test with an add or subtract in a situation where OF or CF is
1288 needed, codegen must be able to prove that the operation cannot see
1289 signed or unsigned overflow, respectively.
1290
1291 //===---------------------------------------------------------------------===//
1292
1293 memcpy/memmove do not lower to SSE copies when possible. A silly example is:
1294 define <16 x float> @foo(<16 x float> %A) nounwind {
1295 %tmp = alloca <16 x float>, align 16
1296 %tmp2 = alloca <16 x float>, align 16
1297 store <16 x float> %A, <16 x float>* %tmp
1298 %s = bitcast <16 x float>* %tmp to i8*
1299 %s2 = bitcast <16 x float>* %tmp2 to i8*
1300 call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1301 %R = load <16 x float>* %tmp2
1302 ret <16 x float> %R
1303 }
1304
1305 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1306
1307 which compiles to:
1308
1309 _foo:
1310 subl $140, %esp
1311 movaps %xmm3, 112(%esp)
1312 movaps %xmm2, 96(%esp)
1313 movaps %xmm1, 80(%esp)
1314 movaps %xmm0, 64(%esp)
1315 movl 60(%esp), %eax
1316 movl %eax, 124(%esp)
1317 movl 56(%esp), %eax
1318 movl %eax, 120(%esp)
1319 movl 52(%esp), %eax
1320 <many many more 32-bit copies>
1321 movaps (%esp), %xmm0
1322 movaps 16(%esp), %xmm1
1323 movaps 32(%esp), %xmm2
1324 movaps 48(%esp), %xmm3
1325 addl $140, %esp
1326 ret
1327
1328 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1329 fall back to lower-granularity chunks.
1330
1331 //===---------------------------------------------------------------------===//
1332
1333 Implement processor-specific optimizations for parity with GCC on these
1334 processors. GCC does two optimizations:
1335
1336 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1337 preceded by a conditional branch or is the target of a jump.
1338 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1339 code contains more than 3 branches.
1340
1341 The first one is done for all AMDs, Core2, and "Generic"
1342 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1343 Core 2, and "Generic"
1344
1345 //===---------------------------------------------------------------------===//
1346 Testcase:
1347 int x(int a) { return (a&0xf0)>>4; }
1348
1349 Current output:
1350 movl 4(%esp), %eax
1351 shrl $4, %eax
1352 andl $15, %eax
1353 ret
1354
1355 Ideal output:
1356 movzbl 4(%esp), %eax
1357 shrl $4, %eax
1358 ret
1359
1360 //===---------------------------------------------------------------------===//
1361
1362 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1363 properly.
1364
1365 When the return value is not used (i.e. only care about the value in the
1366 memory), x86 does not have to use add to implement these. Instead, it can use
1367 add, sub, inc, dec instructions with the "lock" prefix.
1368
1369 This is currently implemented using a bit of instruction selection trick. The
1370 issue is the target independent pattern produces one output and a chain and we
1371 want to map it into one that just output a chain. The current trick is to select
1372 it into a MERGE_VALUES with the first definition being an implicit_def. The
1373 proper solution is to add new ISD opcodes for the no-output variant. DAG
1374 combiner can then transform the node before it gets to target node selection.
1375
1376 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1377 fact these instructions are identical to the non-lock versions. We need a way to
1378 add target specific information to target nodes and have this information
1379 carried over to machine instructions. Asm printer (or JIT) can use this
1380 information to add the "lock" prefix.
1381
1382 //===---------------------------------------------------------------------===//
1383
1384 struct B {
1385 unsigned char y0 : 1;
1386 };
1387
1388 int bar(struct B* a) { return a->y0; }
1389
1390 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1391 %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1392 %2 = load i8* %1, align 1
1393 %3 = and i8 %2, 1
1394 %4 = zext i8 %3 to i32
1395 ret i32 %4
1396 }
1397
1398 bar: # @bar
1399 # %bb.0:
1400 movb (%rdi), %al
1401 andb $1, %al
1402 movzbl %al, %eax
1403 ret
1404
1405 Missed optimization: should be movl+andl.
1406
1407 //===---------------------------------------------------------------------===//
1408
1409 The x86_64 abi says:
1410
1411 Booleans, when stored in a memory object, are stored as single byte objects the
1412 value of which is always 0 (false) or 1 (true).
1413
1414 We are not using this fact:
1415
1416 int bar(_Bool *a) { return *a; }
1417
1418 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1419 %1 = load i8* %a, align 1, !tbaa !0
1420 %tmp = and i8 %1, 1
1421 %2 = zext i8 %tmp to i32
1422 ret i32 %2
1423 }
1424
1425 bar:
1426 movb (%rdi), %al
1427 andb $1, %al
1428 movzbl %al, %eax
1429 ret
1430
1431 GCC produces
1432
1433 bar:
1434 movzbl (%rdi), %eax
1435 ret
1436
1437 //===---------------------------------------------------------------------===//
1438
1439 Take the following C code:
1440 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1441
1442 We generate the following IR with clang:
1443 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1444 entry:
1445 %tmp = xor i32 %b, %a ; <i32> [#uses=1]
1446 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
1447 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
1448 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1449 ret i32 %conv5
1450 }
1451
1452 And the following x86 code:
1453 xorl %esi, %edi
1454 testb $-1, %dil
1455 sete %al
1456 movzbl %al, %eax
1457 ret
1458
1459 A cmpb instead of the xorl+testb would be one instruction shorter.
1460
1461 //===---------------------------------------------------------------------===//
1462
1463 Given the following C code:
1464 int f(int a, int b) { return (signed char)a == (signed char)b; }
1465
1466 We generate the following IR with clang:
1467 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1468 entry:
1469 %sext = shl i32 %a, 24 ; <i32> [#uses=1]
1470 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
1471 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
1472 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
1473 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
1474 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1475 ret i32 %conv5
1476 }
1477
1478 And the following x86 code:
1479 movsbl %sil, %eax
1480 movsbl %dil, %ecx
1481 cmpl %eax, %ecx
1482 sete %al
1483 movzbl %al, %eax
1484 ret
1485
1486
1487 It should be possible to eliminate the sign extensions.
1488
1489 //===---------------------------------------------------------------------===//
1490
1491 LLVM misses a load+store narrowing opportunity in this code:
1492
1493 %struct.bf = type { i64, i16, i16, i32 }
1494
1495 @bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
1496
1497 define void @t1() nounwind ssp {
1498 entry:
1499 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1500 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1501 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
1502 %3 = load i32* %2, align 1 ; <i32> [#uses=1]
1503 %4 = and i32 %3, -65537 ; <i32> [#uses=1]
1504 store i32 %4, i32* %2, align 1
1505 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1506 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1507 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
1508 %8 = load i32* %7, align 1 ; <i32> [#uses=1]
1509 %9 = and i32 %8, -131073 ; <i32> [#uses=1]
1510 store i32 %9, i32* %7, align 1
1511 ret void
1512 }
1513
1514 LLVM currently emits this:
1515
1516 movq bfi(%rip), %rax
1517 andl $-65537, 8(%rax)
1518 movq bfi(%rip), %rax
1519 andl $-131073, 8(%rax)
1520 ret
1521
1522 It could narrow the loads and stores to emit this:
1523
1524 movq bfi(%rip), %rax
1525 andb $-2, 10(%rax)
1526 movq bfi(%rip), %rax
1527 andb $-3, 10(%rax)
1528 ret
1529
1530 The trouble is that there is a TokenFactor between the store and the
1531 load, making it non-trivial to determine if there's anything between
1532 the load and the store which would prohibit narrowing.
1533
1534 //===---------------------------------------------------------------------===//
1535
1536 This code:
1537 void foo(unsigned x) {
1538 if (x == 0) bar();
1539 else if (x == 1) qux();
1540 }
1541
1542 currently compiles into:
1543 _foo:
1544 movl 4(%esp), %eax
1545 cmpl $1, %eax
1546 je LBB0_3
1547 testl %eax, %eax
1548 jne LBB0_4
1549
1550 the testl could be removed:
1551 _foo:
1552 movl 4(%esp), %eax
1553 cmpl $1, %eax
1554 je LBB0_3
1555 jb LBB0_4
1556
1557 0 is the only unsigned number < 1.
1558
1559 //===---------------------------------------------------------------------===//
1560
1561 This code:
1562
1563 %0 = type { i32, i1 }
1564
1565 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1566 entry:
1567 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1568 %cmp = extractvalue %0 %uadd, 1
1569 %inc = zext i1 %cmp to i32
1570 %add = add i32 %x, %sum
1571 %z.0 = add i32 %add, %inc
1572 ret i32 %z.0
1573 }
1574
1575 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1576
1577 compiles to:
1578
1579 _add32carry: ## @add32carry
1580 addl %esi, %edi
1581 sbbl %ecx, %ecx
1582 movl %edi, %eax
1583 subl %ecx, %eax
1584 ret
1585
1586 But it could be:
1587
1588 _add32carry:
1589 leal (%rsi,%rdi), %eax
1590 cmpl %esi, %eax
1591 adcl $0, %eax
1592 ret
1593
1594 //===---------------------------------------------------------------------===//
1595
1596 The hot loop of 256.bzip2 contains code that looks a bit like this:
1597
1598 int foo(char *P, char *Q, int x, int y) {
1599 if (P[0] != Q[0])
1600 return P[0] < Q[0];
1601 if (P[1] != Q[1])
1602 return P[1] < Q[1];
1603 if (P[2] != Q[2])
1604 return P[2] < Q[2];
1605 return P[3] < Q[3];
1606 }
1607
1608 In the real code, we get a lot more wrong than this. However, even in this
1609 code we generate:
1610
1611 _foo: ## @foo
1612 ## %bb.0: ## %entry
1613 movb (%rsi), %al
1614 movb (%rdi), %cl
1615 cmpb %al, %cl
1616 je LBB0_2
1617 LBB0_1: ## %if.then
1618 cmpb %al, %cl
1619 jmp LBB0_5
1620 LBB0_2: ## %if.end
1621 movb 1(%rsi), %al
1622 movb 1(%rdi), %cl
1623 cmpb %al, %cl
1624 jne LBB0_1
1625 ## %bb.3: ## %if.end38
1626 movb 2(%rsi), %al
1627 movb 2(%rdi), %cl
1628 cmpb %al, %cl
1629 jne LBB0_1
1630 ## %bb.4: ## %if.end60
1631 movb 3(%rdi), %al
1632 cmpb 3(%rsi), %al
1633 LBB0_5: ## %if.end60
1634 setl %al
1635 movzbl %al, %eax
1636 ret
1637
1638 Note that we generate jumps to LBB0_1 which does a redundant compare. The
1639 redundant compare also forces the register values to be live, which prevents
1640 folding one of the loads into the compare. In contrast, GCC 4.2 produces:
1641
1642 _foo:
1643 movzbl (%rsi), %eax
1644 cmpb %al, (%rdi)
1645 jne L10
1646 L12:
1647 movzbl 1(%rsi), %eax
1648 cmpb %al, 1(%rdi)
1649 jne L10
1650 movzbl 2(%rsi), %eax
1651 cmpb %al, 2(%rdi)
1652 jne L10
1653 movzbl 3(%rdi), %eax
1654 cmpb 3(%rsi), %al
1655 L10:
1656 setl %al
1657 movzbl %al, %eax
1658 ret
1659
1660 which is "perfect".
1661
1662 //===---------------------------------------------------------------------===//
1663
1664 For the branch in the following code:
1665 int a();
1666 int b(int x, int y) {
1667 if (x & (1<<(y&7)))
1668 return a();
1669 return y;
1670 }
1671
1672 We currently generate:
1673 movb %sil, %al
1674 andb $7, %al
1675 movzbl %al, %eax
1676 btl %eax, %edi
1677 jae .LBB0_2
1678
1679 movl+andl would be shorter than the movb+andb+movzbl sequence.
1680
1681 //===---------------------------------------------------------------------===//
1682
1683 For the following:
1684 struct u1 {
1685 float x, y;
1686 };
1687 float foo(struct u1 u) {
1688 return u.x + u.y;
1689 }
1690
1691 We currently generate:
1692 movdqa %xmm0, %xmm1
1693 pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
1694 addss %xmm1, %xmm0
1695 ret
1696
1697 We could save an instruction here by commuting the addss.
1698
1699 //===---------------------------------------------------------------------===//
1700
1701 This (from PR9661):
1702
1703 float clamp_float(float a) {
1704 if (a > 1.0f)
1705 return 1.0f;
1706 else if (a < 0.0f)
1707 return 0.0f;
1708 else
1709 return a;
1710 }
1711
1712 Could compile to:
1713
1714 clamp_float: # @clamp_float
1715 movss .LCPI0_0(%rip), %xmm1
1716 minss %xmm1, %xmm0
1717 pxor %xmm1, %xmm1
1718 maxss %xmm1, %xmm0
1719 ret
1720
1721 with -ffast-math.
1722
1723 //===---------------------------------------------------------------------===//
1724
1725 This function (from PR9803):
1726
1727 int clamp2(int a) {
1728 if (a > 5)
1729 a = 5;
1730 if (a < 0)
1731 return 0;
1732 return a;
1733 }
1734
1735 Compiles to:
1736
1737 _clamp2: ## @clamp2
1738 pushq %rbp
1739 movq %rsp, %rbp
1740 cmpl $5, %edi
1741 movl $5, %ecx
1742 cmovlel %edi, %ecx
1743 testl %ecx, %ecx
1744 movl $0, %eax
1745 cmovnsl %ecx, %eax
1746 popq %rbp
1747 ret
1748
1749 The move of 0 could be scheduled above the test to make it is xor reg,reg.
1750
1751 //===---------------------------------------------------------------------===//
1752
1753 GCC PR48986. We currently compile this:
1754
1755 void bar(void);
1756 void yyy(int* p) {
1757 if (__sync_fetch_and_add(p, -1) == 1)
1758 bar();
1759 }
1760
1761 into:
1762 movl $-1, %eax
1763 lock
1764 xaddl %eax, (%rdi)
1765 cmpl $1, %eax
1766 je LBB0_2
1767
1768 Instead we could generate:
1769
1770 lock
1771 dec %rdi
1772 je LBB0_2
1773
1774 The trick is to match "fetch_and_add(X, -C) == C".
1775
1776 //===---------------------------------------------------------------------===//
1777
1778 unsigned t(unsigned a, unsigned b) {
1779 return a <= b ? 5 : -5;
1780 }
1781
1782 We generate:
1783 movl $5, %ecx
1784 cmpl %esi, %edi
1785 movl $-5, %eax
1786 cmovbel %ecx, %eax
1787
1788 GCC:
1789 cmpl %edi, %esi
1790 sbbl %eax, %eax
1791 andl $-10, %eax
1792 addl $5, %eax
1793
1794 //===---------------------------------------------------------------------===//
1795