Directory: /external/llvm/lib/Target/X86
(All entries dated 15-Nov-2011.)

Name                              Size
Android.mk                        1.7K
AsmParser/
CMakeLists.txt                    1.9K
Disassembler/
InstPrinter/
Makefile                          861
MCTargetDesc/
README-FPStack.txt                2.7K
README-MMX.txt                    1.5K
README-SSE.txt                    26.4K
README-UNIMPLEMENTED.txt          679
README-X86-64.txt                 6K
README.txt                        53.5K
SSEDomainFix.cpp                  16.1K
TargetInfo/
Utils/
X86.h                             3.1K
X86.td                            11.2K
X86AsmBackend.cpp                 14.4K
X86AsmPrinter.cpp                 26.1K
X86AsmPrinter.h                   2.9K
X86CallingConv.td                 15.3K
X86CodeEmitter.cpp                34.8K
X86COFFMachineModuleInfo.cpp      615
X86COFFMachineModuleInfo.h        1.4K
X86CompilationCallback_Win64.asm  1.6K
X86ELFWriterInfo.cpp              4.2K
X86ELFWriterInfo.h                2.2K
X86FastISel.cpp                   71.6K
X86FixupKinds.h                   1.2K
X86FloatingPoint.cpp              63.7K
X86FrameLowering.cpp              42.7K
X86FrameLowering.h                2.4K
X86Instr3DNow.td                  4.3K
X86InstrArithmetic.td             53.5K
X86InstrBuilder.h                 6.7K
X86InstrCMovSetCC.td              4.9K
X86InstrCompiler.td               74.4K
X86InstrControl.td                13.9K
X86InstrExtension.td              7.8K
X86InstrFMA.td                    2.8K
X86InstrFormats.td                20.5K
X86InstrFPStack.td                32.7K
X86InstrFragmentsSIMD.td          20.6K
X86InstrInfo.cpp                  119.8K
X86InstrInfo.h                    35.9K
X86InstrInfo.td                   75.2K
X86InstrMMX.td                    21.9K
X86InstrShiftRotate.td            37.1K
X86InstrSSE.td                    287.9K
X86InstrSystem.td                 19.5K
X86InstrVMX.td                    2.4K
X86ISelDAGToDAG.cpp               81.2K
X86ISelLowering.cpp               501.3K
X86ISelLowering.h                 41K
X86JITInfo.cpp                    18.8K
X86JITInfo.h                      3K
X86MachineFunctionInfo.h          5.2K
X86MachObjectWriter.cpp           22.3K
X86MCCodeEmitter.cpp              35.6K
X86MCInstLower.cpp                27.6K
X86MCInstLower.h                  1.3K
X86RegisterInfo.cpp               29.8K
X86RegisterInfo.h                 4.6K
X86RegisterInfo.td                19.4K
X86Relocations.h                  2K
X86SelectionDAGInfo.cpp           9.8K
X86SelectionDAGInfo.h             1.9K
X86Subtarget.cpp                  11.1K
X86Subtarget.h                    8.9K
X86TargetMachine.cpp              7K
X86TargetMachine.h                4.4K
X86TargetObjectFile.cpp           4.1K
X86TargetObjectFile.h             2.1K

README-FPStack.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: FP stack related stuff
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
      8 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
      9 
     10 //===---------------------------------------------------------------------===//
     11 
     12 This should use fiadd on chips where it is profitable:
     13 double foo(double P, int *I) { return P+*I; }
     14 
     15 We have fiadd patterns now, but the following have the same cost and
     16 complexity. We need a way to specify that the latter is more profitable.
     17 
     18 def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
     19                     [(set RFP:$dst, (fadd RFP:$src1,
     20                                      (extloadf64f32 addr:$src2)))]>;
     21                 // ST(0) = ST(0) + [mem32]
     22 
     23 def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
     24                     [(set RFP:$dst, (fadd RFP:$src1,
     25                                      (X86fild addr:$src2, i32)))]>;
     26                 // ST(0) = ST(0) + [mem32int]
     27 
     28 //===---------------------------------------------------------------------===//
     29 
     30 The FP stackifier should handle simple permutations to reduce the number of
     31 shuffle instructions, e.g. turning:
     32 
     33 fld P	->		fld Q
     34 fld Q			fld P
     35 fxch
     36 
     37 or:
     38 
     39 fxch	->		fucomi
     40 fucomi			jl X
     41 jg X
     42 
     43 Ideas:
     44 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
     45 
     46 
     47 //===---------------------------------------------------------------------===//
     48 
     49 Add a target specific hook to DAG combiner to handle SINT_TO_FP and
     50 FP_TO_SINT when the source operand is already in memory.
     51 
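For illustration (hypothetical testcases, not from the original note), these are
the kinds of functions such a hook would improve; the integer/FP operand is
already in memory, so x87 could fold the load (e.g. fildl) instead of going
through a GPR first:

/* Hypothetical examples: the conversion source sits in memory. */
double int_to_double(const int *p)    { return (double)*p; /* SINT_TO_FP(load) */ }
int    double_to_int(const double *p) { return (int)*p;    /* FP_TO_SINT(load) */ }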
     52 //===---------------------------------------------------------------------===//
     53 
     54 Open code rint,floor,ceil,trunc:
     55 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
     56 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
     57 
     58 Open code the sincos[f] libcall.
     59 
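For reference, a sketch of a caller that currently ends up in the libcall
(sincos is the GNU libm entry point; sincosf is the float variant):

#define _GNU_SOURCE
#include <math.h>

/* Needs both the sine and cosine of the same angle; today this becomes a
   sincos libcall rather than being open-coded. */
void polar_to_xy(double r, double theta, double *x, double *y) {
  double s, c;
  sincos(theta, &s, &c);
  *x = r * c;
  *y = r * s;
}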
     60 //===---------------------------------------------------------------------===//
     61 
     62 None of the FPStack instructions are handled in
     63 X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
     64 folding spill code into the instructions.
     65 
     66 //===---------------------------------------------------------------------===//
     67 
     68 Currently the x86 codegen isn't very good at mixing SSE and FPStack
     69 code:
     70 
     71 unsigned int foo(double x) { return x; }
     72 
     73 foo:
     74 	subl $20, %esp
     75 	movsd 24(%esp), %xmm0
     76 	movsd %xmm0, 8(%esp)
     77 	fldl 8(%esp)
     78 	fisttpll (%esp)
     79 	movl (%esp), %eax
     80 	addl $20, %esp
     81 	ret
     82 
     83 This just requires being smarter when custom expanding fptoui.
     84 
     85 //===---------------------------------------------------------------------===//
     86 

README-MMX.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: MMX-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 This:
      8 
      9 #include <mmintrin.h>
     10 
     11 __v2si qux(int A) {
     12   return (__v2si){ 0, A };
     13 }
     14 
     15 is compiled into:
     16 
     17 _qux:
     18         subl $28, %esp
     19         movl 32(%esp), %eax
     20         movd %eax, %mm0
     21         movq %mm0, (%esp)
     22         movl (%esp), %eax
     23         movl %eax, 20(%esp)
     24         movq %mm0, 8(%esp)
     25         movl 12(%esp), %eax
     26         movl %eax, 16(%esp)
     27         movq 16(%esp), %mm0
     28         addl $28, %esp
     29         ret
     30 
     31 Yuck!
     32 
     33 GCC gives us:
     34 
     35 _qux:
     36         subl    $12, %esp
     37         movl    16(%esp), %eax
     38         movl    20(%esp), %edx
     39         movl    $0, (%eax)
     40         movl    %edx, 4(%eax)
     41         addl    $12, %esp
     42         ret     $4
     43 
     44 //===---------------------------------------------------------------------===//
     45 
     46 We generate crappy code for this:
     47 
     48 __m64 t() {
     49   return _mm_cvtsi32_si64(1);
     50 }
     51 
     52 _t:
     53 	subl	$12, %esp
     54 	movl	$1, %eax
     55 	movd	%eax, %mm0
     56 	movq	%mm0, (%esp)
     57 	movl	(%esp), %eax
     58 	movl	4(%esp), %edx
     59 	addl	$12, %esp
     60 	ret
     61 
     62 The extra stack traffic is covered in the previous entry. The other reason is
     63 that we are not smart about materializing constants in MMX registers. With -m64:
     64 
     65 	movl	$1, %eax
     66 	movd	%eax, %mm0
     67 	movd	%mm0, %rax
     68 	ret
     69 
     70 We should be using a constantpool load instead:
     71 	movq	LC0(%rip), %rax
     72 

README-SSE.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: SSE-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 SSE Variable shift can be custom lowered to something like this, which uses a
      8 small table + unaligned load + shuffle instead of going through memory.
      9 
     10 __m128i_shift_right:
     11 	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     12 	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
     13 
     14 ...
     15 __m128i shift_right(__m128i value, unsigned long offset) {
     16   return _mm_shuffle_epi8(value,
     17                _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
     18 }
     19 
     20 //===---------------------------------------------------------------------===//
     21 
     22 SSE has instructions for doing operations on complex numbers; we should pattern
     23 match them.  For example, this should turn into a horizontal add:
     24 
     25 typedef float __attribute__((vector_size(16))) v4f32;
     26 float f32(v4f32 A) {
     27   return A[0]+A[1]+A[2]+A[3];
     28 }
     29 
     30 Instead we get this:
     31 
     32 _f32:                                   ## @f32
     33 	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
     34 	addss	%xmm0, %xmm1
     35 	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
     36 	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
     37 	movaps	%xmm0, %xmm3
     38 	addss	%xmm1, %xmm3
     39 	movdqa	%xmm2, %xmm0
     40 	addss	%xmm3, %xmm0
     41 	ret
     42 
     43 Also, there are cases where some simple local SLP would improve codegen a bit.
     44 For example, compiling this:
     45 
     46 _Complex float f32(_Complex float A, _Complex float B) {
     47   return A+B;
     48 }
     49 
     50 into:
     51 
     52 _f32:                                   ## @f32
     53 	movdqa	%xmm0, %xmm2
     54 	addss	%xmm1, %xmm2
     55 	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
     56 	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
     57 	addss	%xmm1, %xmm3
     58 	movaps	%xmm2, %xmm0
     59 	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
     60 	ret
     61 
     62 seems silly when it could just be one addps.
     63 
     64 
     65 //===---------------------------------------------------------------------===//
     66 
     67 Expand libm rounding functions inline:  Significant speedups possible.
     68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
     69 
     70 //===---------------------------------------------------------------------===//
     71 
     72 When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
     73 other fast SSE modes.
     74 
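A minimal sketch of what such a prologue could do, assuming SSE and the usual
MXCSR layout (FTZ is bit 15, DAZ is bit 6); the helper name is made up:

#include <xmmintrin.h>

/* Hypothetical prologue the compiler could emit into "main" under unsafe
   math: turn on flush-to-zero and denormals-are-zero in MXCSR. */
static void enable_fast_sse_modes(void) {
  unsigned int csr = _mm_getcsr();
  csr |= 0x8000;  /* FTZ: flush denormal results to zero */
  csr |= 0x0040;  /* DAZ: treat denormal inputs as zero */
  _mm_setcsr(csr);
}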
     75 //===---------------------------------------------------------------------===//
     76 
     77 Think about doing i64 math in SSE regs on x86-32.
     78 
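For example (illustrative only), this currently lowers to add/adc on pairs of
32-bit GPRs on x86-32; when the operands already live in memory or vectors it
could instead be a single paddq in an XMM register:

unsigned long long add_i64(unsigned long long a, unsigned long long b) {
  return a + b;  /* add/adc today; paddq is an alternative */
}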
     79 //===---------------------------------------------------------------------===//
     80 
     81 This testcase should have no SSE instructions in it, and only one load from
     82 a constant pool:
     83 
     84 double %test3(bool %B) {
     85         %C = select bool %B, double 123.412, double 523.01123123
     86         ret double %C
     87 }
     88 
     89 Currently, the select is being lowered, which prevents the dag combiner from
     90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
     91 
     92 The pattern isel got this one right.
     93 
     94 //===---------------------------------------------------------------------===//
     95 
     96 SSE should implement 'select_cc' using 'emulated conditional moves' that use
     97 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
     98 
     99 double %X(double %Y, double %Z, double %A, double %B) {
    100         %C = setlt double %A, %B
    101         %z = fadd double %Z, 0.0    ;; select operand is not a load
    102         %D = select bool %C, double %Y, double %z
    103         ret double %D
    104 }
    105 
    106 We currently emit:
    107 
    108 _X:
    109         subl $12, %esp
    110         xorpd %xmm0, %xmm0
    111         addsd 24(%esp), %xmm0
    112         movsd 32(%esp), %xmm1
    113         movsd 16(%esp), %xmm2
    114         ucomisd 40(%esp), %xmm1
    115         jb LBB_X_2
    116 LBB_X_1:
    117         movsd %xmm0, %xmm2
    118 LBB_X_2:
    119         movsd %xmm2, (%esp)
    120         fldl (%esp)
    121         addl $12, %esp
    122         ret
    123 
    124 //===---------------------------------------------------------------------===//
    125 
    126 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
    127 feasible.
    128 
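A minimal sketch of the expansion for a copy whose size is a known multiple of
16, assuming SSE2 and using unaligned moves; alignment tuning and tail handling
are omitted:

#include <emmintrin.h>
#include <stddef.h>

/* Copy n bytes (n a multiple of 16) with 128-bit loads and stores. */
static void copy_by_16(void *dst, const void *src, size_t n) {
  for (size_t i = 0; i < n; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)((const char *)src + i));
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
  }
}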
    129 //===---------------------------------------------------------------------===//
    130 
    131 Codegen:
    132   if (copysign(1.0, x) == copysign(1.0, y))
    133 into:
    134   if (x^y & mask)
    135 when using SSE.
    136 
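In scalar terms the transformation boils down to comparing sign bits; a sketch
(using memcpy for the bitcast):

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) exactly when the sign bits agree,
   i.e. when (bits(x) ^ bits(y)) & SIGN_MASK is zero. */
static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}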
    137 //===---------------------------------------------------------------------===//
    138 
    139 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
    140 of a v4sf value.
    141 
    142 //===---------------------------------------------------------------------===//
    143 
    144 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
    145 Perhaps use pxor / xorp* to clear an XMM register first?
    146 
    147 //===---------------------------------------------------------------------===//
    148 
    149 External test Nurbs exposed some problems. Look for
    150 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
    151 emits:
    152 
    153         movaps    (%edx), %xmm2                                 #59.21
    154         movaps    (%edx), %xmm5                                 #60.21
    155         movaps    (%edx), %xmm4                                 #61.21
    156         movaps    (%edx), %xmm3                                 #62.21
    157         movl      40(%ecx), %ebp                                #69.49
    158         shufps    $0, %xmm2, %xmm5                              #60.21
    159         movl      100(%esp), %ebx                               #69.20
    160         movl      (%ebx), %edi                                  #69.20
    161         imull     %ebp, %edi                                    #69.49
    162         addl      (%eax), %edi                                  #70.33
    163         shufps    $85, %xmm2, %xmm4                             #61.21
    164         shufps    $170, %xmm2, %xmm3                            #62.21
    165         shufps    $255, %xmm2, %xmm2                            #63.21
    166         lea       (%ebp,%ebp,2), %ebx                           #69.49
    167         negl      %ebx                                          #69.49
    168         lea       -3(%edi,%ebx), %ebx                           #70.33
    169         shll      $4, %ebx                                      #68.37
    170         addl      32(%ecx), %ebx                                #68.37
    171         testb     $15, %bl                                      #91.13
    172         jne       L_B1.24       # Prob 5%                       #91.13
    173 
    174 This is the llvm code after instruction scheduling:
    175 
    176 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    177 	%reg1078 = MOV32ri -3
    178 	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
    179 	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
    180 	%reg1080 = IMUL32rr %reg1079, %reg1037
    181 	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
    182 	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    183 	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
    184 	%reg1082 = SHL32ri %reg1038, 4
    185 	%reg1039 = ADD32rr %reg1036, %reg1082
    186 	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
    187 	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    188 	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    189 	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    190 	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    191 	%reg1040 = MOV32rr %reg1039
    192 	%reg1084 = AND32ri8 %reg1039, 15
    193 	CMP32ri8 %reg1084, 0
    194 	JE mbb<cond_next204,0xa914d30>
    195 
    196 Still ok. After register allocation:
    197 
    198 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    199 	%EAX = MOV32ri -3
    200 	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
    201 	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
    202 	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
    203 	%EDX = MOV32rm %EDX, 1, %NOREG, 40
    204 	IMUL32rr %EAX<def&use>, %EDX
    205 	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
    206 	%ESI = MOV32rm %ESI, 1, %NOREG, 0
    207 	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
    208 	%EAX = LEA32r %ESI, 1, %EAX, -3
    209 	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
    210 	%ESI = MOV32rm %ESI, 1, %NOREG, 32
    211 	%EDI = MOV32rr %EAX
    212 	SHL32ri %EDI<def&use>, 4
    213 	ADD32rr %EDI<def&use>, %ESI
    214 	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
    215 	%XMM1 = MOVAPSrr %XMM0
    216 	SHUFPSrr %XMM1<def&use>, %XMM1, 170
    217 	%XMM2 = MOVAPSrr %XMM0
    218 	SHUFPSrr %XMM2<def&use>, %XMM2, 0
    219 	%XMM3 = MOVAPSrr %XMM0
    220 	SHUFPSrr %XMM3<def&use>, %XMM3, 255
    221 	SHUFPSrr %XMM0<def&use>, %XMM0, 85
    222 	%EBX = MOV32rr %EDI
    223 	AND32ri8 %EBX<def&use>, 15
    224 	CMP32ri8 %EBX, 0
    225 	JE mbb<cond_next204,0xa914d30>
    226 
    227 This looks really bad. The problem is that shufps is a destructive opcode: since
    228 the same source appears as operand two in more than one shufps op, it results in
    229 a number of copies. Note that icc also suffers from the same problem. Either the
    230 instruction selector should select pshufd, or the register allocator should make
    231 the two-address to three-address transformation.
    232 
    233 It also exposes some other problems. See MOV32ri -3 and the spills.
    234 
    235 //===---------------------------------------------------------------------===//
    236 
    237 Consider:
    238 
    239 __m128 test(float a) {
    240   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
    241 }
    242 
    243 This compiles into:
    244 
    245 movss 4(%esp), %xmm1
    246 mulss %xmm1, %xmm1
    247 xorps %xmm0, %xmm0
    248 movss %xmm1, %xmm0
    249 ret
    250 
    251 Because movss from memory zeros the upper elements and mulss doesn't modify
    252 them, the top three elements of xmm1 are already zeroed.  We could compile this to:
    253 
    254 movss 4(%esp), %xmm0
    255 mulss %xmm0, %xmm0
    256 ret
    257 
    258 //===---------------------------------------------------------------------===//
    259 
    260 Here's a sick and twisted idea.  Consider code like this:
    261 
    262 __m128 test(__m128 a) {
    263   float b = *(float*)&a;
    264   ...
    265   return _mm_set_ps(0.0, 0.0, 0.0, b);
    266 }
    267 
    268 This might compile to this code:
    269 
    270 movaps c(%esp), %xmm1
    271 xorps %xmm0, %xmm0
    272 movss %xmm1, %xmm0
    273 ret
    274 
    275 Now consider if the ... code caused xmm1 to get spilled.  This might produce
    276 this code:
    277 
    278 movaps c(%esp), %xmm1
    279 movaps %xmm1, c2(%esp)
    280 ...
    281 
    282 xorps %xmm0, %xmm0
    283 movaps c2(%esp), %xmm1
    284 movss %xmm1, %xmm0
    285 ret
    286 
    287 However, since the reload is only used by these instructions, we could 
    288 "fold" it into the uses, producing something like this:
    289 
    290 movaps c(%esp), %xmm1
    291 movaps %xmm1, c2(%esp)
    292 ...
    293 
    294 movss c2(%esp), %xmm0
    295 ret
    296 
    297 ... saving two instructions.
    298 
    299 The basic idea is that a reload from a spill slot can, if only one 4-byte
    300 chunk is used, bring in three zeros plus the one element instead of all four elements.
    301 This can be used to simplify a variety of shuffle operations, where the
    302 elements are fixed zeros.
    303 
    304 //===---------------------------------------------------------------------===//
    305 
    306 We generate ugly code for this, probably due to costs being off or something:
    307 
    308 define void @test(float* %P, <4 x float>* %P2 ) {
    309         %xFloat0.688 = load float* %P
    310         %tmp = load <4 x float>* %P2
    311         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
    312         store <4 x float> %inFloat3.713, <4 x float>* %P2
    313         ret void
    314 }
    315 
    316 Generates:
    317 
    318 _test:
    319 	movl	8(%esp), %eax
    320 	movaps	(%eax), %xmm0
    321 	pxor	%xmm1, %xmm1
    322 	movaps	%xmm0, %xmm2
    323 	shufps	$50, %xmm1, %xmm2
    324 	shufps	$132, %xmm2, %xmm0
    325 	movaps	%xmm0, (%eax)
    326 	ret
    327 
    328 Would it be better to generate:
    329 
    330 _test:
    331         movl 8(%esp), %ecx
    332         movaps (%ecx), %xmm0
    333 	xor %eax, %eax
    334         pinsrw $6, %eax, %xmm0
    335         pinsrw $7, %eax, %xmm0
    336         movaps %xmm0, (%ecx)
    337         ret
    338 
    339 ?
    340 
    341 //===---------------------------------------------------------------------===//
    342 
    343 Some useful information in the Apple Altivec / SSE Migration Guide:
    344 
    345 http://developer.apple.com/documentation/Performance/Conceptual/
    346 Accelerate_sse_migration/index.html
    347 
    348 e.g. SSE select using and, andnot, or. Various SSE compare translations.
    349 
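The select idiom the guide describes looks like this with intrinsics (a sketch;
on SSE4.1 blendvps does the same job):

#include <xmmintrin.h>

/* result[i] = a[i] < b[i] ? c[i] : d[i], branch-free: cmplt builds an
   all-ones/all-zeros mask, then and/andnot/or select per lane. */
static __m128 select_lt(__m128 a, __m128 b, __m128 c, __m128 d) {
  __m128 mask = _mm_cmplt_ps(a, b);
  return _mm_or_ps(_mm_and_ps(mask, c), _mm_andnot_ps(mask, d));
}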
    350 //===---------------------------------------------------------------------===//
    351 
    352 Add hooks to commute some CMPP operations.
    353 
    354 //===---------------------------------------------------------------------===//
    355 
    356 Apply the same transformation that merges four float loads into a single
    357 128-bit load to loads from the constant pool.
    358 
    359 //===---------------------------------------------------------------------===//
    360 
    361 Floating point max / min are commutable when -enable-unsafe-fp-math is
    362 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
    363 nodes which are selected to max / min instructions that are marked commutable.
    364 
    365 //===---------------------------------------------------------------------===//
    366 
    367 We should materialize vector constants like "all ones" and "signbit" with 
    368 code like:
    369 
    370      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    371 
    372 and:
    373      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    374      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
    375 
    376 instead of using a load from the constant pool.  The latter is important for
    377 ABS/NEG/copysign etc.
    378 
    379 //===---------------------------------------------------------------------===//
    380 
    381 These functions:
    382 
    383 #include <xmmintrin.h>
    384 __m128i a;
    385 void x(unsigned short n) {
    386   a = _mm_slli_epi32 (a, n);
    387 }
    388 void y(unsigned n) {
    389   a = _mm_slli_epi32 (a, n);
    390 }
    391 
    392 compile to ( -O3 -static -fomit-frame-pointer):
    393 _x:
    394         movzwl  4(%esp), %eax
    395         movd    %eax, %xmm0
    396         movaps  _a, %xmm1
    397         pslld   %xmm0, %xmm1
    398         movaps  %xmm1, _a
    399         ret
    400 _y:
    401         movd    4(%esp), %xmm0
    402         movaps  _a, %xmm1
    403         pslld   %xmm0, %xmm1
    404         movaps  %xmm1, _a
    405         ret
    406 
    407 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
    408 like movd would be sufficient in both cases as the value is already zero 
    409 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
    410 save, as a really-signed value would be undefined for pslld.
    411 
    412 
    413 //===---------------------------------------------------------------------===//
    414 
    415 #include <math.h>
    416 int t1(double d) { return signbit(d); }
    417 
    418 This currently compiles to:
    419 	subl	$12, %esp
    420 	movsd	16(%esp), %xmm0
    421 	movsd	%xmm0, (%esp)
    422 	movl	4(%esp), %eax
    423 	shrl	$31, %eax
    424 	addl	$12, %esp
    425 	ret
    426 
    427 We should use movmskp{s|d} instead.
    428 
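With intrinsics, the desired lowering is just a move-mask (sketch;
_mm_movemask_pd maps to movmskpd):

#include <emmintrin.h>

/* signbit(d) without bouncing through the stack: put d in an XMM register
   and extract the sign bit with movmskpd. */
static int signbit_via_movmsk(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}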
    429 //===---------------------------------------------------------------------===//
    430 
    431 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
    432 (aligned) vector load.  This functionality has a couple of problems.
    433 
    434 1. The code to infer alignment from loads of globals is in the X86 backend,
    435    not the dag combiner.  This is because dagcombine2 needs to be able to see
    436    through the X86ISD::Wrapper node, which DAGCombine can't really do.
    437 2. The code for turning 4 x load into a single vector load is target 
    438    independent and should be moved to the dag combiner.
    439 3. The code for turning 4 x load into a vector load can only handle a direct 
    440    load from a global or a direct load from the stack.  It should be generalized
    441    to handle any load from P, P+4, P+8, P+12, where P can be anything.
    442 4. The alignment inference code cannot handle loads from globals in non-static
    443    mode because it doesn't look through the extra dyld stub load.  If you try
    444    vec_align.ll without -relocation-model=static, you'll see what I mean.
    445 
    446 //===---------------------------------------------------------------------===//
    447 
    448 We should lower store(fneg(load p), q) into an integer load+xor+store, which
    449 eliminates a constant pool load.  For example, consider:
    450 
    451 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
    452 entry:
    453  %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
    454  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
    455  ret i64 %tmp20
    456 }
    457 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
    458 
    459 This currently compiles to:
    460 
    461 LCPI1_0:					#  <4 x float>
    462 	.long	2147483648	# float -0
    463 	.long	2147483648	# float -0
    464 	.long	2147483648	# float -0
    465 	.long	2147483648	# float -0
    466 _ccosf:
    467 	subl	$12, %esp
    468 	movss	16(%esp), %xmm0
    469 	movss	%xmm0, 4(%esp)
    470 	movss	20(%esp), %xmm0
    471 	xorps	LCPI1_0, %xmm0
    472 	movss	%xmm0, (%esp)
    473 	call	L_ccoshf$stub
    474 	addl	$12, %esp
    475 	ret
    476 
    477 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
    478 this code computes the pic base and does two loads to do the constant pool 
    479 load, so the improvement is much bigger.
    480 
    481 The tricky part about this xform is that the argument load/store isn't exposed
    482 until post-legalize, and at that point, the fneg has been custom expanded into 
    483 an X86 fxor.  This means that we need to handle this case in the x86 backend
    484 instead of in target independent code.
    485 
    486 //===---------------------------------------------------------------------===//
    487 
    488 Non-SSE4 insert into 16 x i8 is atrociously bad.
    489 
    490 //===---------------------------------------------------------------------===//
    491 
    492 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
    493 is memory.
    494 
    495 //===---------------------------------------------------------------------===//
    496 
    497 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
    498 sitting between the truncate and the extract.
    499 
    500 //===---------------------------------------------------------------------===//
    501 
    502 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
    503 any number of 0.0 simultaneously.  Currently we only use it for simple
    504 insertions.
    505 
    506 See comments in LowerINSERT_VECTOR_ELT_SSE4.
    507 
    508 //===---------------------------------------------------------------------===//
    509 
    510 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
    511 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
    512 legal, it'll just take a few extra patterns written in the .td file.
    513 
    514 Note: this is not a code quality issue; the custom lowered code happens to be
    515 right, but we shouldn't have to custom lower anything.  This is probably related
    516 to <2 x i64> ops being so bad.
    517 
    518 //===---------------------------------------------------------------------===//
    519 
    520 'select' on vectors and scalars could be a whole lot better.  We currently 
    521 lower them to conditional branches.  On x86-64 for example, we compile this:
    522 
    523 double test(double a, double b, double c, double d) { return a<b ? c : d; }
    524 
    525 to:
    526 
    527 _test:
    528 	ucomisd	%xmm0, %xmm1
    529 	ja	LBB1_2	# entry
    530 LBB1_1:	# entry
    531 	movapd	%xmm3, %xmm2
    532 LBB1_2:	# entry
    533 	movapd	%xmm2, %xmm0
    534 	ret
    535 
    536 instead of:
    537 
    538 _test:
    539 	cmpltsd	%xmm1, %xmm0
    540 	andpd	%xmm0, %xmm2
    541 	andnpd	%xmm3, %xmm0
    542 	orpd	%xmm2, %xmm0
    543 	ret
    544 
    545 For unpredictable branches, the latter is much more efficient.  This should
    546 just be a matter of having scalar SSE map to SELECT_CC and custom expanding
    547 or iseling it.
    548 
    549 //===---------------------------------------------------------------------===//
    550 
    551 LLVM currently generates stack realignment code when it is not necessary.
    552 The problem is that we need to know about stack alignment too early, before
    553 RA runs.
    554 
    555 At that point we don't know whether there will be vector spills or not.
    556 The stack realignment logic is overly conservative here, but otherwise we
    557 could produce unaligned loads/stores.
    558 
    559 Fixing this will require some huge RA changes.
    560 
    561 Testcase:
    562 #include <emmintrin.h>
    563 
    564 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
    565 
    566 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
    567 - 22725, - 12873};
    568 
    569 vSInt16 madd(vSInt16 b)
    570 {
    571     return _mm_madd_epi16(a, b);
    572 }
    573 
    574 Generated code (x86-32, linux):
    575 madd:
    576         pushl   %ebp
    577         movl    %esp, %ebp
    578         andl    $-16, %esp
    579         movaps  .LCPI1_0, %xmm1
    580         pmaddwd %xmm1, %xmm0
    581         movl    %ebp, %esp
    582         popl    %ebp
    583         ret
    584 
    585 //===---------------------------------------------------------------------===//
    586 
    587 Consider:
    588 #include <emmintrin.h> 
    589 __m128 foo2 (float x) {
    590  return _mm_set_ps (0, 0, x, 0);
    591 }
    592 
    593 In x86-32 mode, we generate this spiffy code:
    594 
    595 _foo2:
    596 	movss	4(%esp), %xmm0
    597 	pshufd	$81, %xmm0, %xmm0
    598 	ret
    599 
    600 in x86-64 mode, we generate this code, which could be better:
    601 
    602 _foo2:
    603 	xorps	%xmm1, %xmm1
    604 	movss	%xmm0, %xmm1
    605 	pshufd	$81, %xmm1, %xmm0
    606 	ret
    607 
    608 In sse4 mode, we could use insertps to make both better.
    609 
    610 Here's another testcase that could use insertps [mem]:
    611 
    612 #include <xmmintrin.h>
    613 extern float x2, x3;
    614 __m128 foo1 (float x1, float x4) {
    615  return _mm_set_ps (x2, x1, x3, x4);
    616 }
    617 
    618 gcc mainline compiles it to:
    619 
    620 foo1:
    621        insertps        $0x10, x2(%rip), %xmm0
    622        insertps        $0x10, x3(%rip), %xmm1
    623        movaps  %xmm1, %xmm2
    624        movlhps %xmm0, %xmm2
    625        movaps  %xmm2, %xmm0
    626        ret
    627 
    628 //===---------------------------------------------------------------------===//
    629 
    630 We compile vector multiply-by-constant into poor code:
    631 
    632 define <4 x i32> @f(<4 x i32> %i) nounwind  {
    633 	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
    634 	ret <4 x i32> %A
    635 }
    636 
    637 On targets without SSE4.1, this compiles into:
    638 
    639 LCPI1_0:					##  <4 x i32>
    640 	.long	10
    641 	.long	10
    642 	.long	10
    643 	.long	10
    644 	.text
    645 	.align	4,0x90
    646 	.globl	_f
    647 _f:
    648 	pshufd	$3, %xmm0, %xmm1
    649 	movd	%xmm1, %eax
    650 	imull	LCPI1_0+12, %eax
    651 	movd	%eax, %xmm1
    652 	pshufd	$1, %xmm0, %xmm2
    653 	movd	%xmm2, %eax
    654 	imull	LCPI1_0+4, %eax
    655 	movd	%eax, %xmm2
    656 	punpckldq	%xmm1, %xmm2
    657 	movd	%xmm0, %eax
    658 	imull	LCPI1_0, %eax
    659 	movd	%eax, %xmm1
    660 	movhlps	%xmm0, %xmm0
    661 	movd	%xmm0, %eax
    662 	imull	LCPI1_0+8, %eax
    663 	movd	%eax, %xmm0
    664 	punpckldq	%xmm0, %xmm1
    665 	movaps	%xmm1, %xmm0
    666 	punpckldq	%xmm2, %xmm0
    667 	ret
    668 
    669 It would be better to synthesize integer vector multiplication by constants
    670 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
    671 simple cases such as multiplication by powers of two would be better as
    672 vector shifts than as multiplications.
    673 
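For the multiply-by-10 case above, the shift/add expansion would be (sketch,
SSE2 intrinsics; 10*x = 8*x + 2*x):

#include <emmintrin.h>

/* Multiply each i32 lane by 10 with pslld/paddd instead of four scalar
   imulls: 10*x = (x << 3) + (x << 1). */
static __m128i mul10_epi32(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}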
    674 //===---------------------------------------------------------------------===//
    675 
    676 We compile this:
    677 
    678 __m128i
    679 foo2 (char x)
    680 {
    681   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
    682 }
    683 
    684 into:
    685 	movl	$1, %eax
    686 	xorps	%xmm0, %xmm0
    687 	pinsrw	$2, %eax, %xmm0
    688 	movzbl	4(%esp), %eax
    689 	pinsrw	$3, %eax, %xmm0
    690 	movl	$256, %eax
    691 	pinsrw	$7, %eax, %xmm0
    692 	ret
    693 
    694 
    695 gcc-4.2:
    696 	subl	$12, %esp
    697 	movzbl	16(%esp), %eax
    698 	movdqa	LC0, %xmm0
    699 	pinsrw	$3, %eax, %xmm0
    700 	addl	$12, %esp
    701 	ret
    702 	.const
    703 	.align 4
    704 LC0:
    705 	.word	0
    706 	.word	0
    707 	.word	1
    708 	.word	0
    709 	.word	0
    710 	.word	0
    711 	.word	0
    712 	.word	256
    713 
    714 With SSE4, it should be
    715       movdqa  .LC0(%rip), %xmm0
    716       pinsrb  $6, %edi, %xmm0
    717 
    718 //===---------------------------------------------------------------------===//
    719 
    720 We should transform a shuffle of two vectors of constants into a single vector
    721 of constants. Likewise, insertelement of a constant into a vector of constants
    722 should result in a vector of constants, e.g. 2008-06-25-VecISelBug.ll.
    723 
    724 We compiled it to something horrible:
    725 
    726 	.align	4
    727 LCPI1_1:					##  float
    728 	.long	1065353216	## float 1
    729 	.const
    730 
    731 	.align	4
    732 LCPI1_0:					##  <4 x float>
    733 	.space	4
    734 	.long	1065353216	## float 1
    735 	.space	4
    736 	.long	1065353216	## float 1
    737 	.text
    738 	.align	4,0x90
    739 	.globl	_t
    740 _t:
    741 	xorps	%xmm0, %xmm0
    742 	movhps	LCPI1_0, %xmm0
    743 	movss	LCPI1_1, %xmm1
    744 	movaps	%xmm0, %xmm2
    745 	shufps	$2, %xmm1, %xmm2
    746 	shufps	$132, %xmm2, %xmm0
    747 	movaps	%xmm0, 0
    748 
    749 //===---------------------------------------------------------------------===//
    750 rdar://5907648
    751 
    752 This function:
    753 
    754 float foo(unsigned char x) {
    755   return x;
    756 }
    757 
    758 compiles to (x86-32):
    759 
    760 define float @foo(i8 zeroext  %x) nounwind  {
    761 	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
    762 	ret float %tmp12
    763 }
    764 
    765 compiles to:
    766 
    767 _foo:
    768 	subl	$4, %esp
    769 	movzbl	8(%esp), %eax
    770 	cvtsi2ss	%eax, %xmm0
    771 	movss	%xmm0, (%esp)
    772 	flds	(%esp)
    773 	addl	$4, %esp
    774 	ret
    775 
    776 We should be able to use:
    777   cvtsi2ss 8(%esp), %xmm0
    778 since we know the stack slot is already zext'd.
    779 
    780 //===---------------------------------------------------------------------===//
    781 
    782 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
    783 when code size is critical. movlps is slower than movsd on core2 but it's one
    784 byte shorter.
    785 
    786 //===---------------------------------------------------------------------===//
    787 
    788 We should use a dynamic programming based approach to tell when using FPStack
    789 operations is cheaper than SSE.  SciMark montecarlo contains code like this
    790 for example:
    791 
    792 double MonteCarlo_num_flops(int Num_samples) {
    793     return ((double) Num_samples)* 4.0;
    794 }
    795 
    796 In fpstack mode, this compiles into:
    797 
    798 LCPI1_0:					
    799 	.long	1082130432	## float 4.000000e+00
    800 _MonteCarlo_num_flops:
    801 	subl	$4, %esp
    802 	movl	8(%esp), %eax
    803 	movl	%eax, (%esp)
    804 	fildl	(%esp)
    805 	fmuls	LCPI1_0
    806 	addl	$4, %esp
    807 	ret
    808         
    809 in SSE mode, it compiles into significantly slower code:
    810 
    811 _MonteCarlo_num_flops:
    812 	subl	$12, %esp
    813 	cvtsi2sd	16(%esp), %xmm0
    814 	mulsd	LCPI1_0, %xmm0
    815 	movsd	%xmm0, (%esp)
    816 	fldl	(%esp)
    817 	addl	$12, %esp
    818 	ret
    819 
    820 There are also other cases in scimark where using fpstack is better; for
    821 example, it is cheaper to do fld1 than to load from a constant pool, so
    822 "load, add 1.0, store" is better done in the fp stack, etc.
    823 
    824 //===---------------------------------------------------------------------===//
    825 
    826 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
    827 "cmpsd".  For example, this code:
    828 
    829 double d1(double x) { return x == x ? x : x + x; }
    830 
    831 Compiles into:
    832 
    833 _d1:
    834 	ucomisd	%xmm0, %xmm0
    835 	jnp	LBB1_2
    836 	addsd	%xmm0, %xmm0
    837 	ret
    838 LBB1_2:
    839 	ret
    840 
    841 Also, the 'ret's should be shared.  This is PR6032.
    842 
    843 //===---------------------------------------------------------------------===//
    844 
    845 These should compile into the same code (PR6214): perhaps instcombine should
    846 canonicalize the former into the latter?
    847 
    848 define float @foo(float %x) nounwind {
    849   %t = bitcast float %x to i32
    850   %s = and i32 %t, 2147483647
    851   %d = bitcast i32 %s to float
    852   ret float %d
    853 }
    854 
    855 declare float @fabsf(float %n)
    856 define float @bar(float %x) nounwind {
    857   %d = call float @fabsf(float %x)
    858   ret float %d
    859 }
    860 
    861 //===---------------------------------------------------------------------===//
    862 
    863 This IR (from PR6194):
    864 
    865 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
    866 target triple = "x86_64-apple-darwin10.0.0"
    867 
    868 %0 = type { double, double }
    869 %struct.float3 = type { float, float, float }
    870 
    871 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
    872 entry:
    873   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
    874   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
    875   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
    876   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
    877   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
    878   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
    879   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
    880   store float %tmp12, float* %tmp5
    881   ret void
    882 }
    883 
    884 Compiles to:
    885 
    886 _test:                                  ## @test
    887 	movd	%xmm0, %rax
    888 	shrq	$32, %rax
    889 	movl	%eax, 4(%rdi)
    890 	ret
    891 
    892 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
    893 doing a shuffle from v[1] to v[0] then a float store.
    894 
    895 //===---------------------------------------------------------------------===//
    896 
    897 On SSE4 machines, we compile this code:
    898 
    899 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
    900        <2 x float> *%P) nounwind {
    901   %Z = fadd <2 x float> %Q, %R
    902 
    903   store <2 x float> %Z, <2 x float> *%P
    904   ret <2 x float> %Z
    905 }
    906 
    907 into:
    908 
    909 _test2:                                 ## @test2
    910 ## BB#0:
    911 	insertps	$0, %xmm2, %xmm2
    912 	insertps	$16, %xmm3, %xmm2
    913 	insertps	$0, %xmm0, %xmm3
    914 	insertps	$16, %xmm1, %xmm3
    915 	addps	%xmm2, %xmm3
    916 	movq	%xmm3, (%rdi)
    917 	movaps	%xmm3, %xmm0
    918 	pshufd	$1, %xmm3, %xmm1
    919                                         ## kill: XMM1<def> XMM1<kill>
    920 	ret
    921 
    922 The insertps's of $0 are pointless complex copies.
    923 
    924 //===---------------------------------------------------------------------===//
    925 
    926 If SSE4.1 is available we should inline rounding functions instead of emitting
    927 a libcall.
    928 
    929 floor: roundsd $0x01, %xmm, %xmm
    930 ceil:  roundsd $0x02, %xmm, %xmm
    931 
    932 and likewise for the single precision versions.
    933 
    934 Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
    935 corresponding nodes and some targets (including X86) aren't ready for them.
    936 
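With intrinsics the inline expansion would look like this (a sketch;
_mm_floor_sd and _mm_ceil_sd wrap roundsd):

#include <smmintrin.h>

/* floor/ceil without a libcall when SSE4.1 is available. */
static double floor_sse41(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_floor_sd(v, v));  /* roundsd, round toward -inf */
}
static double ceil_sse41(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_ceil_sd(v, v));   /* roundsd, round toward +inf */
}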
    937 //===---------------------------------------------------------------------===//
    938 

README-UNIMPLEMENTED.txt

      1 //===---------------------------------------------------------------------===//
      2 // Testcases that crash the X86 backend because they aren't implemented
      3 //===---------------------------------------------------------------------===//
      4 
      5 These are cases we know the X86 backend doesn't handle.  Patches are welcome
      6 and appreciated, because no one has signed up to implement these yet.
      7 Implementing these would allow elimination of the corresponding intrinsics,
      8 which would be great.
      9 
     10 1) vector shifts
     11 2) vector comparisons
     12 3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
     13 4) bitcasts from vectors to scalars: PR2804
     14 5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
     15 

README-X86-64.txt

      1 //===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
      2 
      3 AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
      4 multiplication by a constant. How much of it applies to Intel's X86-64
      5 implementation? There are definite trade-offs to consider: latency vs. register
      6 pressure vs. code size.
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 Are we better off using branches instead of cmove to implement FP to
     11 unsigned i64?
     12 
     13 _conv:
     14 	ucomiss	LC0(%rip), %xmm0
     15 	cvttss2siq	%xmm0, %rdx
     16 	jb	L3
     17 	subss	LC0(%rip), %xmm0
     18 	movabsq	$-9223372036854775808, %rax
     19 	cvttss2siq	%xmm0, %rdx
     20 	xorq	%rax, %rdx
     21 L3:
     22 	movq	%rdx, %rax
     23 	ret
     24 
     25 instead of
     26 
     27 _conv:
     28 	movss LCPI1_0(%rip), %xmm1
     29 	cvttss2siq %xmm0, %rcx
     30 	movaps %xmm0, %xmm2
     31 	subss %xmm1, %xmm2
     32 	cvttss2siq %xmm2, %rax
     33 	movabsq $-9223372036854775808, %rdx
     34 	xorq %rdx, %rax
     35 	ucomiss %xmm1, %xmm0
     36 	cmovb %rcx, %rax
     37 	ret
     38 
     39 It seems like the jb branch has a high likelihood of being taken, so the
     40 branching version would save a few instructions.
     41 
     42 //===---------------------------------------------------------------------===//
     43 
     44 It's not possible to reference the AH, BH, CH, and DH registers in an
     45 instruction requiring a REX prefix. However, divb and mulb both produce results
     46 in AH. If isel emits a CopyFromReg which gets turned into a movb, that movb can
     47 be allocated r8b - r15b, which cannot be used in the same instruction as AH.
     48 
     49 To get around this, isel emits a CopyFromReg from AX and then right-shifts it
     50 by 8 and truncates it. It's not pretty, but it works. We need some register
     51 allocation magic to make the hack go away (e.g. putting additional constraints
     52 on the result of the movb).
     53 
     54 //===---------------------------------------------------------------------===//
     55 
     56 The x86-64 ABI for hidden-argument struct returns requires that the
     57 incoming value of %rdi be copied into %rax by the callee upon return.
     58 
     59 The idea is that it saves callers from having to remember this value,
     60 which would often require a callee-saved register. Callees usually
     61 need to keep this value live for most of their body anyway, so it
     62 doesn't add a significant burden on them.
     63 
     64 We currently implement this in codegen, however this is suboptimal
     65 because it means that it would be quite awkward to implement the
     66 optimization for callers.
     67 
     68 A better implementation would be to relax the LLVM IR rules for sret
     69 arguments to allow a function with an sret argument to have a non-void
     70 return type, and to have the front-end set up the sret argument value
     71 as the return value of the function. The front-end could more easily
     72 emit uses of the returned struct value to be in terms of the function's
     73 lowered return value, and it would free non-C frontends from a
     74 complication only required by a C-based ABI.
     75 
     76 //===---------------------------------------------------------------------===//
     77 
     78 We get a redundant zero extension for code like this:
     79 
     80 int mask[1000];
     81 int foo(unsigned x) {
     82  if (x < 10)
     83    x = x * 45;
     84  else
     85    x = x * 78;
     86  return mask[x];
     87 }
     88 
     89 _foo:
     90 LBB1_0:	## entry
     91 	cmpl	$9, %edi
     92 	jbe	LBB1_3	## bb
     93 LBB1_1:	## bb1
     94 	imull	$78, %edi, %eax
     95 LBB1_2:	## bb2
     96 	movl	%eax, %eax                    <----
     97 	movq	_mask@GOTPCREL(%rip), %rcx
     98 	movl	(%rcx,%rax,4), %eax
     99 	ret
    100 LBB1_3:	## bb
    101 	imull	$45, %edi, %eax
    102 	jmp	LBB1_2	## bb2
    103   
    104 Before regalloc, we have:
    105 
    106         %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
    107         JMP mbb<bb2,0x203afb0>
    108     Successors according to CFG: 0x203afb0 (#3)
    109 
    110 bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
    111     Predecessors according to CFG: 0x203aec0 (#0)
    112         %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
    113     Successors according to CFG: 0x203afb0 (#3)
    114 
    115 bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
    116     Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
    117         %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
    118                             %reg1026, mbb<bb1,0x203af60>
    119         %reg1029<def> = MOVZX64rr32 %reg1027
    120 
    121 so we'd have to know that IMUL32rri8 leaves the high word zero extended and
    122 be able to recognize the zero extend.  This could also presumably be implemented
    123 if we have whole-function selectiondags.
    124 
    125 //===---------------------------------------------------------------------===//
    126 
    127 Take the following code
    128 (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
    129 extern unsigned long table[];
    130 unsigned long foo(unsigned char *p) {
    131   unsigned long tag = *p;
    132   return table[tag >> 4] + table[tag & 0xf];
    133 }
    134 
    135 Current code generated:
    136 	movzbl	(%rdi), %eax
    137 	movq	%rax, %rcx
    138 	andq	$240, %rcx
    139 	shrq	%rcx
    140 	andq	$15, %rax
    141 	movq	table(,%rax,8), %rax
    142 	addq	table(%rcx), %rax
    143 	ret
    144 
    145 Issues:
    146 1. First movq should be movl; saves a byte.
    147 2. Both andq's should be andl; saves another two bytes.  I think this was
    148    implemented at one point, but subsequently regressed.
    149 3. shrq should be shrl; saves another byte.
    150 4. The first andq can be completely eliminated by using a slightly more
    151    expensive addressing mode.
    152 
    153 //===---------------------------------------------------------------------===//
    154 
    155 Consider the following (contrived testcase, but contains common factors):
    156 
    157 #include <stdarg.h>
    158 int test(int x, ...) {
    159   int sum, i;
    160   va_list l;
    161   va_start(l, x);
    162   for (i = 0; i < x; i++)
    163     sum += va_arg(l, int);
    164   va_end(l);
    165   return sum;
    166 }
    167 
    168 Testcase given in C because fixing it will likely involve changing the IR
    169 generated for it.  The primary issue with the result is that it doesn't do any
    170 of the optimizations which are possible if we know the address of a va_list
    171 in the current function is never taken:
    172 1. We shouldn't spill the XMM registers because we only call va_arg with "int".
    173 2. It would be nice if we could scalarrepl the va_list.
    174 3. Probably overkill, but it'd be cool if we could peel off the first five
    175 iterations of the loop.
    176 
    177 Other optimizations involving functions which use va_arg on floats which don't
    178 have the address of a va_list taken:
    179 1. Conversely to the above, we shouldn't spill general registers if we only
    180    call va_arg on "double".
    181 2. If we know nothing more than 64 bits wide is read from the XMM registers,
    182    we can change the spilling code to reduce the amount of stack used by half.
    183 
    184 //===---------------------------------------------------------------------===//
    185 

README.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend.
      3 //===---------------------------------------------------------------------===//
      4 
      5 We should add support for the "movbe" instruction, which does a byte-swapping
      6 copy (3-addr bswap + memory support?).  This is available on Atom processors.
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 This should be one DIV/IDIV instruction, not a libcall:
     11 
     12 unsigned test(unsigned long long X, unsigned Y) {
     13         return X/Y;
     14 }
     15 
     16 This can be done trivially with a custom legalizer.  What about overflow 
     17 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
     18 
     19 //===---------------------------------------------------------------------===//
     20 
     21 Improvements to the multiply -> shift/add algorithm:
     22 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
     23 
     24 //===---------------------------------------------------------------------===//
     25 
     26 Improve code like this (occurs fairly frequently, e.g. in LLVM):
     27 long long foo(int x) { return 1LL << x; }
     28 
     29 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
     30 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
     31 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
     32 
     33 Another useful one would be  ~0ULL >> X and ~0ULL << X.
     34 
     35 One better solution for 1LL << x is:
     36         xorl    %eax, %eax
     37         xorl    %edx, %edx
     38         testb   $32, %cl
     39         sete    %al
     40         setne   %dl
     41         sall    %cl, %eax
     42         sall    %cl, %edx
     43 
     44 But that requires good 8-bit subreg support.
     45 
     46 Also, this might be better.  It's an extra shift, but it's one instruction
     47 shorter, and doesn't stress 8-bit subreg support.
     48 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
     49 but without the unnecessary and.)
     50         movl %ecx, %eax
     51         shrl $5, %eax
     52         movl %eax, %edx
     53         xorl $1, %edx
     54         sall %cl, %eax
     55         sall %cl, %edx
     56 
     57 64-bit shifts (in general) expand to really bad code.  Instead of using
     58 cmovs, we should expand to a conditional branch like GCC produces.
     59 
     60 //===---------------------------------------------------------------------===//
     61 
     62 Some isel ideas:
     63 
     64 1. Dynamic programming based approach when compile time is not an
     65    issue.
     66 2. Code duplication (addressing mode) during isel.
     67 3. Other ideas from "Register-Sensitive Selection, Duplication, and
     68    Sequencing of Instructions".
     69 4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
     70    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
     71    and other related papers.
     72    http://citeseer.ist.psu.edu/govindarajan01minimum.html
     73 
     74 //===---------------------------------------------------------------------===//
     75 
     76 Should we promote i16 to i32 to avoid partial register update stalls?
     77 
     78 //===---------------------------------------------------------------------===//
     79 
     80 Leave any_extend as a pseudo instruction and hint to the register
     81 allocator. Delay codegen until post register allocation.
     82 Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
     83 the coalescer how to deal with it, though.
     84 
     85 //===---------------------------------------------------------------------===//
     86 
     87 It appears icc uses push for parameter passing. We need to investigate.
     88 
     89 //===---------------------------------------------------------------------===//
     90 
     91 This:
     92 
     93 void foo(void);
     94 void bar(int x, int *P) { 
     95   x >>= 2;
     96   if (x) 
     97     foo();
     98   *P = x;
     99 }
    100 
    101 compiles into:
    102 
    103 	movq	%rsi, %rbx
    104 	movl	%edi, %r14d
    105 	sarl	$2, %r14d
    106 	testl	%r14d, %r14d
    107 	je	LBB0_2
    108 
    109 Instead of doing an explicit test, we can use the flags off the sar.  This
    110 occurs in a bigger testcase like this, which is pretty common:
    111 
    112 #include <vector>
    113 int test1(std::vector<int> &X) {
    114   int Sum = 0;
    115   for (long i = 0, e = X.size(); i != e; ++i)
    116     X[i] = 0;
    117   return Sum;
    118 }
    119 
    120 //===---------------------------------------------------------------------===//
    121 
    122 Only use inc/neg/not instructions on processors where they are faster than
    123 add/sub/xor.  They are slower on the P4 due to only updating some processor
    124 flags.
    125 
    126 //===---------------------------------------------------------------------===//
    127 
    128 The instruction selector sometimes misses folding a load into a compare.  The
    129 pattern is written as (cmp reg, (load p)).  Because the compare isn't 
    130 commutative, it is not matched with the load on both sides.  The dag combiner
    131 should be made smart enough to canonicalize the load into the RHS of a compare
    132 when it can invert the result of the compare for free.
    133 
    134 //===---------------------------------------------------------------------===//
    135 
    136 In many cases, LLVM generates code like this:
    137 
    138 _test:
    139         movl 8(%esp), %eax
    140         cmpl %eax, 4(%esp)
    141         setl %al
    142         movzbl %al, %eax
    143         ret
    144 
    145 On some processors (which ones?), it is more efficient to do this:
    146 
    147 _test:
    148         movl 8(%esp), %ebx
    149         xor  %eax, %eax
    150         cmpl %ebx, 4(%esp)
    151         setl %al
    152         ret
    153 
    154 Doing this correctly is tricky though, as the xor clobbers the flags.
    155 
    156 //===---------------------------------------------------------------------===//
    157 
    158 We should generate bts/btr/etc instructions on targets where they are cheap or
    159 when codesize is important.  e.g., for:
    160 
    161 void setbit(int *target, int bit) {
    162     *target |= (1 << bit);
    163 }
    164 void clearbit(int *target, int bit) {
    165     *target &= ~(1 << bit);
    166 }
    167 
    168 //===---------------------------------------------------------------------===//
    169 
    170 Instead of the following for memset char*, 1, 10:
    171 
    172 	movl $16843009, 4(%edx)
    173 	movl $16843009, (%edx)
    174 	movw $257, 8(%edx)
    175 
    176 It might be better to generate
    177 
    178 	movl $16843009, %eax
    179 	movl %eax, 4(%edx)
    180 	movl %eax, (%edx)
    181 	movw %ax, 8(%edx)
    182 	
    183 when we can spare a register. It reduces code size.
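
For reference, a sketch of the source for that testcase:

#include <string.h>
void set10(char *p) {
  memset(p, 1, 10);   /* the "memset char*, 1, 10" case above */
}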
    184 
    185 //===---------------------------------------------------------------------===//
    186 
    187 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
    188 get this:
    189 
    190 define i32 @test1(i32 %X) {
    191     %Y = sdiv i32 %X, 8
    192     ret i32 %Y
    193 }
    194 
    195 _test1:
    196         movl 4(%esp), %eax
    197         movl %eax, %ecx
    198         sarl $31, %ecx
    199         shrl $29, %ecx
    200         addl %ecx, %eax
    201         sarl $3, %eax
    202         ret
    203 
    204 GCC knows several different ways to codegen it, one of which is this:
    205 
    206 _test1:
    207         movl    4(%esp), %eax
    208         cmpl    $-1, %eax
    209         leal    7(%eax), %ecx
    210         cmovle  %ecx, %eax
    211         sarl    $3, %eax
    212         ret
    213 
    214 which is probably slower, but it's interesting at least :)
    215 
    216 //===---------------------------------------------------------------------===//
    217 
    218 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
    219 We should leave these as libcalls for everything over a much lower threshold,
    220 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
    221 stores, TLB preheating, etc.).
    222 
    223 //===---------------------------------------------------------------------===//
    224 
    225 Optimize this into something reasonable:
    226  x * copysign(1.0, y) * copysign(1.0, z)
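
As a compilable testcase (a sketch of the expression above):

#include <math.h>
double f(double x, double y, double z) {
  return x * copysign(1.0, y) * copysign(1.0, z);
}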
    227 
    228 //===---------------------------------------------------------------------===//
    229 
    230 Optimize copysign(x, *y) to use an integer load from y.
    231 
    232 //===---------------------------------------------------------------------===//
    233 
    234 The following tests perform worse with LSR:
    235 
    236 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
    237 
    238 //===---------------------------------------------------------------------===//
    239 
    240 Adding to the list of cmp / test poor codegen issues:
    241 
    242 int test(__m128 *A, __m128 *B) {
    243   if (_mm_comige_ss(*A, *B))
    244     return 3;
    245   else
    246     return 4;
    247 }
    248 
    249 _test:
    250 	movl 8(%esp), %eax
    251 	movaps (%eax), %xmm0
    252 	movl 4(%esp), %eax
    253 	movaps (%eax), %xmm1
    254 	comiss %xmm0, %xmm1
    255 	setae %al
    256 	movzbl %al, %ecx
    257 	movl $3, %eax
    258 	movl $4, %edx
    259 	cmpl $0, %ecx
    260 	cmove %edx, %eax
    261 	ret
    262 
    263 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
    264 are a number of issues. 1) We are introducing a setcc between the result of the
    265 intrinsic call and the select. 2) The intrinsic is expected to produce an i32 value
    266 so an any_extend (which becomes a zero extend) is added.
    267 
    268 We probably need some kind of target DAG combine hook to fix this.
    269 
    270 //===---------------------------------------------------------------------===//
    271 
    272 We generate significantly worse code for this than GCC:
    273 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
    274 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
    275 
    276 There is also one case we do worse on PPC.
    277 
    278 //===---------------------------------------------------------------------===//
    279 
    280 For this:
    281 
    282 int test(int a)
    283 {
    284   return a * 3;
    285 }
    286 
    287 We currently emit
    288 	imull $3, 4(%esp), %eax
    289 
    290 Perhaps this is what we really should generate instead? Is imull three or four
    291 cycles? Note: ICC generates this:
    292 	movl	4(%esp), %eax
    293 	leal	(%eax,%eax,2), %eax
    294 
    295 The current instruction priority is based on pattern complexity. The former is
    296 more "complex" because it folds a load so the latter will not be emitted.
    297 
    298 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
    299 should always try to match LEA first since the LEA matching code does some
    300 estimate to determine whether the match is profitable.
    301 
    302 However, if we care more about code size, then imull is better. It's two bytes
    303 shorter than movl + leal.
    304 
    305 On a Pentium M, both variants have the same characteristics with regard
    306 to throughput; however, the multiplication has a latency of four cycles, as
    307 opposed to two cycles for the movl+lea variant.
    308 
    309 //===---------------------------------------------------------------------===//
    310 
    311 __builtin_ffs codegen is messy.
    312 
    313 int ffs_(unsigned X) { return __builtin_ffs(X); }
    314 
    315 llvm produces:
    316 ffs_:
    317         movl    4(%esp), %ecx
    318         bsfl    %ecx, %eax
    319         movl    $32, %edx
    320         cmove   %edx, %eax
    321         incl    %eax
    322         xorl    %edx, %edx
    323         testl   %ecx, %ecx
    324         cmove   %edx, %eax
    325         ret
    326 
    327 vs gcc:
    328 
    329 _ffs_:
    330         movl    $-1, %edx
    331         bsfl    4(%esp), %eax
    332         cmove   %edx, %eax
    333         addl    $1, %eax
    334         ret
    335 
    336 Another example of __builtin_ffs (use predsimplify to eliminate a select):
    337 
    338 int foo (unsigned long j) {
    339   if (j)
    340     return __builtin_ffs (j) - 1;
    341   else
    342     return 0;
    343 }
    344 
    345 //===---------------------------------------------------------------------===//
    346 
    347 It appears gcc places string data with linkonce linkage in
    348 .section __TEXT,__const_coal,coalesced instead of
    349 .section __DATA,__const_coal,coalesced.
    350 Take a look at darwin.h; there are other Darwin assembler directives that we
    351 do not make use of.
    352 
    353 //===---------------------------------------------------------------------===//
    354 
    355 define i32 @foo(i32* %a, i32 %t) {
    356 entry:
    357 	br label %cond_true
    358 
    359 cond_true:		; preds = %cond_true, %entry
    360 	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
    361 	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
    362 	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
    363 	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
    364 	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
    365 	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
    366 	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
    367 	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
    368 	br i1 %tmp, label %bb12, label %cond_true
    369 
    370 bb12:		; preds = %cond_true
    371 	ret i32 %tmp7
    372 }
    373 is pessimized by -loop-reduce and -indvars
    374 
    375 //===---------------------------------------------------------------------===//
    376 
    377 u32 to float conversion improvement:
    378 
    379 float uint32_2_float( unsigned u ) {
    380   float fl = (int) (u & 0xffff);
    381   float fh = (int) (u >> 16);
    382   fh *= 0x1.0p16f;
    383   return fh + fl;
    384 }
    385 
    386 00000000        subl    $0x04,%esp
    387 00000003        movl    0x08(%esp,1),%eax
    388 00000007        movl    %eax,%ecx
    389 00000009        shrl    $0x10,%ecx
    390 0000000c        cvtsi2ss        %ecx,%xmm0
    391 00000010        andl    $0x0000ffff,%eax
    392 00000015        cvtsi2ss        %eax,%xmm1
    393 00000019        mulss   0x00000078,%xmm0
    394 00000021        addss   %xmm1,%xmm0
    395 00000025        movss   %xmm0,(%esp,1)
    396 0000002a        flds    (%esp,1)
    397 0000002d        addl    $0x04,%esp
    398 00000030        ret
    399 
    400 //===---------------------------------------------------------------------===//
    401 
    402 When using the fastcc ABI, align the stack slot of an argument of type double on an
    403 8-byte boundary to improve performance.
    404 
    405 //===---------------------------------------------------------------------===//
    406 
    407 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
    408 simplifications for integer "x cmp y ? a : b".
    409 
    410 //===---------------------------------------------------------------------===//
    411 
    412 Consider the expansion of:
    413 
    414 define i32 @test3(i32 %X) {
    415         %tmp1 = urem i32 %X, 255
    416         ret i32 %tmp1
    417 }
    418 
    419 Currently it compiles to:
    420 
    421 ...
    422         movl $2155905153, %ecx
    423         movl 8(%esp), %esi
    424         movl %esi, %eax
    425         mull %ecx
    426 ...
    427 
    428 This could be "reassociated" into:
    429 
    430         movl $2155905153, %eax
    431         movl 8(%esp), %ecx
    432         mull %ecx
    433 
    434 to avoid the copy.  In fact, the existing two-address stuff would do this
    435 except that mul isn't a commutative 2-addr instruction.  I guess this has
    436 to be done at isel time based on the #uses of the mul?
    437 
    438 //===---------------------------------------------------------------------===//
    439 
    440 Make sure the instruction which starts a loop does not cross a cacheline
    441 boundary. This requires knowing the exact length of each machine instruction.
    442 That is somewhat complicated, but doable. Example 256.bzip2:
    443 
    444 In the new trace, the hot loop has an instruction which crosses a cacheline
    445 boundary.  In addition to potential cache misses, this can't help decoding as I
    446 imagine there has to be some kind of complicated decoder reset and realignment
    447 to grab the bytes from the next cacheline.
    448 
    449 532  532 0x3cfc movb     1809(%esp,%esi), %bl   <<<--- spans 2 64 byte lines
    450 942  942 0x3d03 movl     %dh, 1809(%esp,%esi)
    451 937  937 0x3d0a incl     %esi
    452 3    3   0x3d0b cmpb     %bl, %dl
    453 27   27  0x3d0d jnz      0x000062db <main+11707>
    454 
    455 //===---------------------------------------------------------------------===//
    456 
    457 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
    458 
    459 //===---------------------------------------------------------------------===//
    460 
    461 This could be a single 16-bit load.
    462 
    463 int f(char *p) {
    464     if ((p[0] == 1) & (p[1] == 2)) return 1;
    465     return 0;
    466 }
    467 
    468 //===---------------------------------------------------------------------===//
    469 
    470 We should inline lrintf and probably other libc functions.
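
A hypothetical testcase; the lrintf libcall could become a single
cvtss2si-style rounding instruction:

#include <math.h>
long round_to_long(float x) {
  return lrintf(x);   /* hypothetical testcase */
}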
    471 
    472 //===---------------------------------------------------------------------===//
    473 
    474 Use the FLAGS values from arithmetic instructions more.  For example, compile:
    475 
    476 int add_zf(int *x, int y, int a, int b) {
    477      if ((*x += y) == 0)
    478           return a;
    479      else
    480           return b;
    481 }
    482 
    483 to:
    484        addl    %esi, (%rdi)
    485        movl    %edx, %eax
    486        cmovne  %ecx, %eax
    487        ret
    488 instead of:
    489 
    490 _add_zf:
    491         addl (%rdi), %esi
    492         movl %esi, (%rdi)
    493         testl %esi, %esi
    494         cmove %edx, %ecx
    495         movl %ecx, %eax
    496         ret
    497 
    498 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
    499 without a test instruction.
    500 
    501 //===---------------------------------------------------------------------===//
    502 
    503 These two functions have identical effects:
    504 
    505 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
    506 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
    507 
    508 We currently compile them to:
    509 
    510 _f:
    511         movl 4(%esp), %eax
    512         movl %eax, %ecx
    513         incl %ecx
    514         movl 8(%esp), %edx
    515         cmpl %edx, %ecx
    516         jne LBB1_2      #UnifiedReturnBlock
    517 LBB1_1: #cond_true
    518         addl $2, %eax
    519         ret
    520 LBB1_2: #UnifiedReturnBlock
    521         movl %ecx, %eax
    522         ret
    523 _f2:
    524         movl 4(%esp), %eax
    525         movl %eax, %ecx
    526         incl %ecx
    527         cmpl 8(%esp), %ecx
    528         sete %cl
    529         movzbl %cl, %ecx
    530         leal 1(%ecx,%eax), %eax
    531         ret
    532 
    533 both of which are inferior to GCC's:
    534 
    535 _f:
    536         movl    4(%esp), %edx
    537         leal    1(%edx), %eax
    538         addl    $2, %edx
    539         cmpl    8(%esp), %eax
    540         cmove   %edx, %eax
    541         ret
    542 _f2:
    543         movl    4(%esp), %eax
    544         addl    $1, %eax
    545         xorl    %edx, %edx
    546         cmpl    8(%esp), %eax
    547         sete    %dl
    548         addl    %edx, %eax
    549         ret
    550 
    551 //===---------------------------------------------------------------------===//
    552 
    553 This code:
    554 
    555 void test(int X) {
    556   if (X) abort();
    557 }
    558 
    559 is currently compiled to:
    560 
    561 _test:
    562         subl $12, %esp
    563         cmpl $0, 16(%esp)
    564         jne LBB1_1
    565         addl $12, %esp
    566         ret
    567 LBB1_1:
    568         call L_abort$stub
    569 
    570 It would be better to produce:
    571 
    572 _test:
    573         subl $12, %esp
    574         cmpl $0, 16(%esp)
    575         jne L_abort$stub
    576         addl $12, %esp
    577         ret
    578 
    579 This can be applied to any no-return function call that takes no arguments etc.
    580 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
    581 something like this:
    582 
    583 _test:
    584         cmpl $0, 4(%esp)
    585         jne LBB1_1
    586         ret
    587 LBB1_1:
    588         subl $12, %esp
    589         call L_abort$stub
    590 
    591 Both are useful in different situations.  Finally, it could be shrink-wrapped
    592 and tail called, like this:
    593 
    594 _test:
    595         cmpl $0, 4(%esp)
    596         jne LBB1_1
    597         ret
    598 LBB1_1:
    599         pop %eax   # realign stack.
    600         call L_abort$stub
    601 
    602 Though this probably isn't worth it.
    603 
    604 //===---------------------------------------------------------------------===//
    605 
    606 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
    607 a neg instead of a sub instruction.  Consider:
    608 
    609 int test(char X) { return 7-X; }
    610 
    611 we currently produce:
    612 _test:
    613         movl $7, %eax
    614         movsbl 4(%esp), %ecx
    615         subl %ecx, %eax
    616         ret
    617 
    618 We would use one fewer register if codegen'd as:
    619 
    620         movsbl 4(%esp), %eax
    621 	neg %eax
    622         add $7, %eax
    623         ret
    624 
    625 Note that this isn't beneficial if the load can be folded into the sub.  In
    626 this case, we want a sub:
    627 
    628 int test(int X) { return 7-X; }
    629 _test:
    630         movl $7, %eax
    631         subl 4(%esp), %eax
    632         ret
    633 
    634 //===---------------------------------------------------------------------===//
    635 
    636 Leaf functions that require one 4-byte spill slot have a prolog like this:
    637 
    638 _foo:
    639         pushl   %esi
    640         subl    $4, %esp
    641 ...
    642 and an epilog like this:
    643         addl    $4, %esp
    644         popl    %esi
    645         ret
    646 
    647 It would be smaller, and potentially faster, to push eax on entry and to
    648 pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
    649 into any return registers :)
    650 
    651 //===---------------------------------------------------------------------===//
    652 
    653 The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
    654 branches.  We generate really poor code for:
    655 
    656 double testf(double a) {
    657        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    658 }
    659 
    660 For example, the entry BB is:
    661 
    662 _testf:
    663         subl    $20, %esp
    664         pxor    %xmm0, %xmm0
    665         movsd   24(%esp), %xmm1
    666         ucomisd %xmm0, %xmm1
    667         setnp   %al
    668         sete    %cl
    669         testb   %cl, %al
    670         jne     LBB1_5  # UnifiedReturnBlock
    671 LBB1_1: # cond_true
    672 
    673 
    674 it would be better to replace the last four instructions with:
    675 
    676 	jp LBB1_1
    677 	je LBB1_5
    678 LBB1_1:
    679 
    680 We also codegen the inner ?: into a diamond:
    681 
    682        cvtss2sd        LCPI1_0(%rip), %xmm2
    683         cvtss2sd        LCPI1_1(%rip), %xmm3
    684         ucomisd %xmm1, %xmm0
    685         ja      LBB1_3  # cond_true
    686 LBB1_2: # cond_true
    687         movapd  %xmm3, %xmm2
    688 LBB1_3: # cond_true
    689         movapd  %xmm2, %xmm0
    690         ret
    691 
    692 We should sink the load into xmm3 into the LBB1_2 block.  This should
    693 be pretty easy, and will nuke all the copies.
    694 
    695 //===---------------------------------------------------------------------===//
    696 
    697 This:
    698         #include <algorithm>
    699         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    700         { return std::make_pair(a + b, a + b < a); }
    701         bool no_overflow(unsigned a, unsigned b)
    702         { return !full_add(a, b).second; }
    703 
    704 Should compile to:
    705 	addl	%esi, %edi
    706 	setae	%al
    707 	movzbl	%al, %eax
    708 	ret
    709 
    710 on x86-64, instead of the rather stupid-looking:
    711 	addl	%esi, %edi
    712 	setb	%al
    713 	xorb	$1, %al
    714 	movzbl	%al, %eax
    715 	ret
    716 
    717 
    718 //===---------------------------------------------------------------------===//
    719 
    720 The following code:
    721 
    722 bb114.preheader:		; preds = %cond_next94
    723 	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
    724 	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
    725 	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
    726 	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
    727 	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
    728 	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
    729 	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
    730 	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
    731 	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
    732 	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
    733 	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
    734 	br label %bb114
    735 
    736 produces:
    737 
    738 LBB3_5:	# bb114.preheader
    739 	movswl	-68(%ebp), %eax
    740 	movl	$32, %ecx
    741 	movl	%ecx, -80(%ebp)
    742 	subl	%eax, -80(%ebp)
    743 	movswl	-52(%ebp), %eax
    744 	movl	%ecx, -84(%ebp)
    745 	subl	%eax, -84(%ebp)
    746 	movswl	-70(%ebp), %eax
    747 	movl	%ecx, -88(%ebp)
    748 	subl	%eax, -88(%ebp)
    749 	movswl	-50(%ebp), %eax
    750 	subl	%eax, %ecx
    751 	movl	%ecx, -76(%ebp)
    752 	movswl	-42(%ebp), %eax
    753 	movl	%eax, -92(%ebp)
    754 	movswl	-66(%ebp), %eax
    755 	movl	%eax, -96(%ebp)
    756 	movw	$0, -98(%ebp)
    757 
    758 This appears to be bad because the RA is not folding the store to the stack 
    759 slot into the movl.  The above instructions could be:
    760 	movl    $32, -80(%ebp)
    761 ...
    762 	movl    $32, -84(%ebp)
    763 ...
    764 This seems like a cross between remat and spill folding.
    765 
    766 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
    767 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
    768 vice-versa).
    769 
    770 //===---------------------------------------------------------------------===//
    771 
    772 This code:
    773 
    774 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
    775 	br i1 %tmp659, label %cond_true662, label %cond_next715
    776 
    777 produces this:
    778 
    779 	testw	%cx, %cx
    780 	movswl	%cx, %esi
    781 	jns	LBB4_109	# cond_next715
    782 
    783 Shark tells us that using %cx in the testw instruction is sub-optimal. It
    784 suggests using the 32-bit register (which is what ICC uses).
    785 
    786 //===---------------------------------------------------------------------===//
    787 
    788 We compile this:
    789 
    790 void compare (long long foo) {
    791   if (foo < 4294967297LL)
    792     abort();
    793 }
    794 
    795 to:
    796 
    797 compare:
    798         subl    $4, %esp
    799         cmpl    $0, 8(%esp)
    800         setne   %al
    801         movzbw  %al, %ax
    802         cmpl    $1, 12(%esp)
    803         setg    %cl
    804         movzbw  %cl, %cx
    805         cmove   %ax, %cx
    806         testb   $1, %cl
    807         jne     .LBB1_2 # UnifiedReturnBlock
    808 .LBB1_1:        # ifthen
    809         call    abort
    810 .LBB1_2:        # UnifiedReturnBlock
    811         addl    $4, %esp
    812         ret
    813 
    814 (also really horrible code on ppc).  This is due to the expand code for 64-bit
    815 compares.  GCC produces multiple branches, which is much nicer:
    816 
    817 compare:
    818         subl    $12, %esp
    819         movl    20(%esp), %edx
    820         movl    16(%esp), %eax
    821         decl    %edx
    822         jle     .L7
    823 .L5:
    824         addl    $12, %esp
    825         ret
    826         .p2align 4,,7
    827 .L7:
    828         jl      .L4
    829         cmpl    $0, %eax
    830         .p2align 4,,8
    831         ja      .L5
    832 .L4:
    833         .p2align 4,,9
    834         call    abort
    835 
    836 //===---------------------------------------------------------------------===//
    837 
    838 Tail call optimization improvements: Tail call optimization currently
    839 pushes all arguments on the top of the stack (their normal place for
    840 non-tail call optimized calls) that source from the caller's arguments
    841 or that source from a virtual register (also possibly sourcing from
    842 the caller's arguments).
    843 This is done to prevent overwriting of parameters (see example
    844 below) that might be used later.
    845 
    846 example:  
    847 
    848 int callee(int32, int64); 
    849 int caller(int32 arg1, int32 arg2) { 
    850   int64 local = arg2 * 2; 
    851   return callee(arg2, (int64)local); 
    852 }
    853 
    854 [arg1]          [!arg2 no longer valid since we moved local onto it]
    855 [arg2]      ->  [(int64)
    856 [RETADDR]        local  ]
    857 
    858 Moving arg1 onto the stack slot of callee function would overwrite
    859 arg2 of the caller.
    860 
    861 Possible optimizations:
    862 
    863 
    864  - Analyse the actual parameters of the callee to see which would
    865    overwrite a caller parameter which is used by the callee and only
    866    push them onto the top of the stack.
    867 
    868    int callee (int32 arg1, int32 arg2);
    869    int caller (int32 arg1, int32 arg2) {
    870        return callee(arg1,arg2);
    871    }
    872 
    873    Here we don't need to write any variables to the top of the stack
    874    since they don't overwrite each other.
    875 
    876    int callee (int32 arg1, int32 arg2);
    877    int caller (int32 arg1, int32 arg2) {
    878        return callee(arg2,arg1);
    879    }
    880 
    881    Here we need to push the arguments because they overwrite each
    882    other.
    883 
    884 //===---------------------------------------------------------------------===//
    885 
    886 main ()
    887 {
    888   int i = 0;
    889   unsigned long int z = 0;
    890 
    891   do {
    892     z -= 0x00004000;
    893     i++;
    894     if (i > 0x00040000)
    895       abort ();
    896   } while (z > 0);
    897   exit (0);
    898 }
    899 
    900 gcc compiles this to:
    901 
    902 _main:
    903 	subl	$28, %esp
    904 	xorl	%eax, %eax
    905 	jmp	L2
    906 L3:
    907 	cmpl	$262144, %eax
    908 	je	L10
    909 L2:
    910 	addl	$1, %eax
    911 	cmpl	$262145, %eax
    912 	jne	L3
    913 	call	L_abort$stub
    914 L10:
    915 	movl	$0, (%esp)
    916 	call	L_exit$stub
    917 
    918 llvm:
    919 
    920 _main:
    921 	subl	$12, %esp
    922 	movl	$1, %eax
    923 	movl	$16384, %ecx
    924 LBB1_1:	# bb
    925 	cmpl	$262145, %eax
    926 	jge	LBB1_4	# cond_true
    927 LBB1_2:	# cond_next
    928 	incl	%eax
    929 	addl	$4294950912, %ecx
    930 	cmpl	$16384, %ecx
    931 	jne	LBB1_1	# bb
    932 LBB1_3:	# bb11
    933 	xorl	%eax, %eax
    934 	addl	$12, %esp
    935 	ret
    936 LBB1_4:	# cond_true
    937 	call	L_abort$stub
    938 
    939 1. LSR should rewrite the first cmp with induction variable %ecx.
    940 2. DAG combiner should fold
    941         leal    1(%eax), %edx
    942         cmpl    $262145, %edx
    943    =>
    944         cmpl    $262144, %eax
    945 
    946 //===---------------------------------------------------------------------===//
    947 
    948 define i64 @test(double %X) {
    949 	%Y = fptosi double %X to i64
    950 	ret i64 %Y
    951 }
    952 
    953 compiles to:
    954 
    955 _test:
    956 	subl	$20, %esp
    957 	movsd	24(%esp), %xmm0
    958 	movsd	%xmm0, 8(%esp)
    959 	fldl	8(%esp)
    960 	fisttpll	(%esp)
    961 	movl	4(%esp), %edx
    962 	movl	(%esp), %eax
    963 	addl	$20, %esp
    964 	#FP_REG_KILL
    965 	ret
    966 
    967 This should just fldl directly from the input stack slot.
    968 
    969 //===---------------------------------------------------------------------===//
    970 
    971 This code:
    972 int foo (int x) { return (x & 65535) | 255; }
    973 
    974 Should compile into:
    975 
    976 _foo:
    977         movzwl  4(%esp), %eax
    978         orl     $255, %eax
    979         ret
    980 
    981 instead of:
    982 _foo:
    983 	movl	$65280, %eax
    984 	andl	4(%esp), %eax
    985 	orl	$255, %eax
    986 	ret
    987 
    988 //===---------------------------------------------------------------------===//
    989 
    990 We're codegen'ing multiply of long longs inefficiently:
    991 
    992 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    993   return arg1 *  arg2;
    994 }
    995 
    996 We compile to (fomit-frame-pointer):
    997 
    998 _LLM:
    999 	pushl	%esi
   1000 	movl	8(%esp), %ecx
   1001 	movl	16(%esp), %esi
   1002 	movl	%esi, %eax
   1003 	mull	%ecx
   1004 	imull	12(%esp), %esi
   1005 	addl	%edx, %esi
   1006 	imull	20(%esp), %ecx
   1007 	movl	%esi, %edx
   1008 	addl	%ecx, %edx
   1009 	popl	%esi
   1010 	ret
   1011 
   1012 This looks like a scheduling deficiency and lack of remat of the load from
   1013 the argument area.  ICC apparently produces:
   1014 
   1015         movl      8(%esp), %ecx
   1016         imull     12(%esp), %ecx
   1017         movl      16(%esp), %eax
   1018         imull     4(%esp), %eax 
   1019         addl      %eax, %ecx  
   1020         movl      4(%esp), %eax
   1021         mull      12(%esp) 
   1022         addl      %ecx, %edx
   1023         ret
   1024 
   1025 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
   1026 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
   1027 
   1028 //===---------------------------------------------------------------------===//
   1029 
   1030 We can fold a store into "zeroing a reg".  Instead of:
   1031 
   1032 xorl    %eax, %eax
   1033 movl    %eax, 124(%esp)
   1034 
   1035 we should get:
   1036 
   1037 movl    $0, 124(%esp)
   1038 
   1039 if the flags of the xor are dead.
   1040 
   1041 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
   1042 be folded into: shl [mem], 1
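
A hypothetical testcase for the second point:

int shl1(int x) {
  /* Hypothetical testcase: selected as "add reg,reg" today; if x is
     spilled, folding into "shll $1, mem" would avoid the reload. */
  return x << 1;
}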
   1043 
   1044 //===---------------------------------------------------------------------===//
   1045 
   1046 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
   1047 or and instruction, for example:
   1048 
   1049 	xorpd	LCPI1_0, %xmm2
   1050 
   1051 However, if xmm2 gets spilled, we end up with really ugly code like this:
   1052 
   1053 	movsd	(%esp), %xmm0
   1054 	xorpd	LCPI1_0, %xmm0
   1055 	movsd	%xmm0, (%esp)
   1056 
   1057 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
   1058 the neg/abs instruction, turning it into an *integer* operation, like this:
   1059 
   1060 	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
   1061 
   1062 You could also use xorb, but xorl is less likely to lead to a partial register
   1063 stall.  Here is a contrived testcase:
   1064 
   1065 double a, b, c;
   1066 void test(double *P) {
   1067   double X = *P;
   1068   a = X;
   1069   bar();
   1070   X = -X;
   1071   b = X;
   1072   bar();
   1073   c = X;
   1074 }
   1075 
   1076 //===---------------------------------------------------------------------===//
   1077 
   1078 The code generated on x86 for checking for signed overflow on a multiply, done the
   1079 obvious way, is much longer than it needs to be.
   1080 
   1081 int x(int a, int b) {
   1082   long long prod = (long long)a*b;
   1083   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
   1084 }
   1085 
   1086 See PR2053 for more details.
   1087 
   1088 //===---------------------------------------------------------------------===//
   1089 
   1090 We should investigate using cdq/cltd (effect: edx = sar eax, 31)
   1091 more aggressively; it should cost the same as a move+shift on any modern
   1092 processor, but it's a lot shorter. Downside is that it puts more
   1093 pressure on register allocation because it has fixed operands.
   1094 
   1095 Example:
   1096 int abs(int x) {return x < 0 ? -x : x;}
   1097 
   1098 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
   1099 abs:
   1100         movl    4(%esp), %eax
   1101         cltd
   1102         xorl    %edx, %eax
   1103         subl    %edx, %eax
   1104         ret
   1105 
   1106 //===---------------------------------------------------------------------===//
   1107 
   1108 Take the following code (from 
   1109 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
   1110 
   1111 extern unsigned char first_one[65536];
   1112 int FirstOnet(unsigned long long arg1)
   1113 {
   1114   if (arg1 >> 48)
   1115     return (first_one[arg1 >> 48]);
   1116   return 0;
   1117 }
   1118 
   1119 
   1120 The following code is currently generated:
   1121 FirstOnet:
   1122         movl    8(%esp), %eax
   1123         cmpl    $65536, %eax
   1124         movl    4(%esp), %ecx
   1125         jb      .LBB1_2 # UnifiedReturnBlock
   1126 .LBB1_1:        # ifthen
   1127         shrl    $16, %eax
   1128         movzbl  first_one(%eax), %eax
   1129         ret
   1130 .LBB1_2:        # UnifiedReturnBlock
   1131         xorl    %eax, %eax
   1132         ret
   1133 
   1134 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
   1135 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
   1136 
   1137 //===---------------------------------------------------------------------===//
   1138 
   1139 We compile this function:
   1140 
   1141 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
   1142 entry:
   1143 	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
   1144 	br i1 %tmp2, label %bb7, label %bb
   1145 
   1146 bb:		; preds = %entry
   1147 	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
   1148 	ret i32 %tmp6
   1149 
   1150 bb7:		; preds = %entry
   1151 	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
   1152 	ret i32 %tmp10
   1153 }
   1154 
   1155 to:
   1156 
   1157 foo:                                    # @foo
   1158 # BB#0:                                 # %entry
   1159 	movl	4(%esp), %ecx
   1160 	cmpb	$0, 16(%esp)
   1161 	je	.LBB0_2
   1162 # BB#1:                                 # %bb
   1163 	movl	8(%esp), %eax
   1164 	addl	%ecx, %eax
   1165 	ret
   1166 .LBB0_2:                                # %bb7
   1167 	movl	12(%esp), %edx
   1168 	movl	%ecx, %eax
   1169 	subl	%edx, %eax
   1170 	ret
   1171 
   1172 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
   1173 couple more movls by putting 4(%esp) into %eax instead of %ecx.
   1174 
   1175 //===---------------------------------------------------------------------===//
   1176 
   1177 See rdar://4653682.
   1178 
   1179 From flops:
   1180 
   1181 LBB1_15:        # bb310
   1182         cvtss2sd        LCPI1_0, %xmm1
   1183         addsd   %xmm1, %xmm0
   1184         movsd   176(%esp), %xmm2
   1185         mulsd   %xmm0, %xmm2
   1186         movapd  %xmm2, %xmm3
   1187         mulsd   %xmm3, %xmm3
   1188         movapd  %xmm3, %xmm4
   1189         mulsd   LCPI1_23, %xmm4
   1190         addsd   LCPI1_24, %xmm4
   1191         mulsd   %xmm3, %xmm4
   1192         addsd   LCPI1_25, %xmm4
   1193         mulsd   %xmm3, %xmm4
   1194         addsd   LCPI1_26, %xmm4
   1195         mulsd   %xmm3, %xmm4
   1196         addsd   LCPI1_27, %xmm4
   1197         mulsd   %xmm3, %xmm4
   1198         addsd   LCPI1_28, %xmm4
   1199         mulsd   %xmm3, %xmm4
   1200         addsd   %xmm1, %xmm4
   1201         mulsd   %xmm2, %xmm4
   1202         movsd   152(%esp), %xmm1
   1203         addsd   %xmm4, %xmm1
   1204         movsd   %xmm1, 152(%esp)
   1205         incl    %eax
   1206         cmpl    %eax, %esi
   1207         jge     LBB1_15 # bb310
   1208 LBB1_16:        # bb358.loopexit
   1209         movsd   152(%esp), %xmm0
   1210         addsd   %xmm0, %xmm0
   1211         addsd   LCPI1_22, %xmm0
   1212         movsd   %xmm0, 152(%esp)
   1213 
   1214 Rather than spilling the result of the last addsd in the loop, we should have
   1215 inserted a copy to split the interval (one for the duration of the loop, one
   1216 extending to the fall through). The register pressure in the loop isn't high
   1217 enough to warrant the spill.
   1218 
   1219 Also check why xmm7 is not used at all in the function.
   1220 
   1221 //===---------------------------------------------------------------------===//
   1222 
   1223 Take the following:
   1224 
   1225 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
   1226 target triple = "i386-apple-darwin8"
   1227 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
   1228 define fastcc void @abort_gzip() noreturn nounwind  {
   1229 entry:
   1230 	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
   1231 	br i1 %tmp.b.i, label %bb.i, label %bb4.i
   1232 bb.i:		; preds = %entry
   1233 	tail call void @exit( i32 1 ) noreturn nounwind 
   1234 	unreachable
   1235 bb4.i:		; preds = %entry
   1236 	store i1 true, i1* @in_exit.4870.b
   1237 	tail call void @exit( i32 1 ) noreturn nounwind 
   1238 	unreachable
   1239 }
   1240 declare void @exit(i32) noreturn nounwind 
   1241 
   1242 This compiles into:
   1243 _abort_gzip:                            ## @abort_gzip
   1244 ## BB#0:                                ## %entry
   1245 	subl	$12, %esp
   1246 	movb	_in_exit.4870.b, %al
   1247 	cmpb	$1, %al
   1248 	jne	LBB0_2
   1249 
   1250 We somehow miss folding the movb into the cmpb.
   1251 
   1252 //===---------------------------------------------------------------------===//
   1253 
   1254 We compile:
   1255 
   1256 int test(int x, int y) {
   1257   return x-y-1;
   1258 }
   1259 
   1260 into (-m64):
   1261 
   1262 _test:
   1263 	decl	%edi
   1264 	movl	%edi, %eax
   1265 	subl	%esi, %eax
   1266 	ret
   1267 
   1268 it would be better to codegen as: x+~y  (notl+addl)
   1269 
   1270 //===---------------------------------------------------------------------===//
   1271 
   1272 This code:
   1273 
   1274 int foo(const char *str,...)
   1275 {
   1276  __builtin_va_list a; int x;
   1277  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
   1278  return x;
   1279 }
   1280 
   1281 gets compiled into this on x86-64:
   1282 	subq    $200, %rsp
   1283         movaps  %xmm7, 160(%rsp)
   1284         movaps  %xmm6, 144(%rsp)
   1285         movaps  %xmm5, 128(%rsp)
   1286         movaps  %xmm4, 112(%rsp)
   1287         movaps  %xmm3, 96(%rsp)
   1288         movaps  %xmm2, 80(%rsp)
   1289         movaps  %xmm1, 64(%rsp)
   1290         movaps  %xmm0, 48(%rsp)
   1291         movq    %r9, 40(%rsp)
   1292         movq    %r8, 32(%rsp)
   1293         movq    %rcx, 24(%rsp)
   1294         movq    %rdx, 16(%rsp)
   1295         movq    %rsi, 8(%rsp)
   1296         leaq    (%rsp), %rax
   1297         movq    %rax, 192(%rsp)
   1298         leaq    208(%rsp), %rax
   1299         movq    %rax, 184(%rsp)
   1300         movl    $48, 180(%rsp)
   1301         movl    $8, 176(%rsp)
   1302         movl    176(%rsp), %eax
   1303         cmpl    $47, %eax
   1304         jbe     .LBB1_3 # bb
   1305 .LBB1_1:        # bb3
   1306         movq    184(%rsp), %rcx
   1307         leaq    8(%rcx), %rax
   1308         movq    %rax, 184(%rsp)
   1309 .LBB1_2:        # bb4
   1310         movl    (%rcx), %eax
   1311         addq    $200, %rsp
   1312         ret
   1313 .LBB1_3:        # bb
   1314         movl    %eax, %ecx
   1315         addl    $8, %eax
   1316         addq    192(%rsp), %rcx
   1317         movl    %eax, 176(%rsp)
   1318         jmp     .LBB1_2 # bb4
   1319 
   1320 gcc 4.3 generates:
   1321 	subq    $96, %rsp
   1322 .LCFI0:
   1323         leaq    104(%rsp), %rax
   1324         movq    %rsi, -80(%rsp)
   1325         movl    $8, -120(%rsp)
   1326         movq    %rax, -112(%rsp)
   1327         leaq    -88(%rsp), %rax
   1328         movq    %rax, -104(%rsp)
   1329         movl    $8, %eax
   1330         cmpl    $48, %eax
   1331         jb      .L6
   1332         movq    -112(%rsp), %rdx
   1333         movl    (%rdx), %eax
   1334         addq    $96, %rsp
   1335         ret
   1336         .p2align 4,,10
   1337         .p2align 3
   1338 .L6:
   1339         mov     %eax, %edx
   1340         addq    -104(%rsp), %rdx
   1341         addl    $8, %eax
   1342         movl    %eax, -120(%rsp)
   1343         movl    (%rdx), %eax
   1344         addq    $96, %rsp
   1345         ret
   1346 
   1347 and it gets compiled into this on x86:
   1348 	pushl   %ebp
   1349         movl    %esp, %ebp
   1350         subl    $4, %esp
   1351         leal    12(%ebp), %eax
   1352         movl    %eax, -4(%ebp)
   1353         leal    16(%ebp), %eax
   1354         movl    %eax, -4(%ebp)
   1355         movl    12(%ebp), %eax
   1356         addl    $4, %esp
   1357         popl    %ebp
   1358         ret
   1359 
   1360 gcc 4.3 generates:
   1361 	pushl   %ebp
   1362         movl    %esp, %ebp
   1363         movl    12(%ebp), %eax
   1364         popl    %ebp
   1365         ret
   1366 
   1367 //===---------------------------------------------------------------------===//
   1368 
   1369 Teach tblgen not to check bitconvert source type in some cases. This allows us
   1370 to consolidate the following patterns in X86InstrMMX.td:
   1371 
   1372 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1373                                                   (iPTR 0))))),
   1374           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
   1375 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1376                                                   (iPTR 0))))),
   1377           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
   1378 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1379                                                   (iPTR 0))))),
   1380           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
   1381 
   1382 There are other cases in various td files.
   1383 
   1384 //===---------------------------------------------------------------------===//
   1385 
   1386 Take something like the following on x86-32:
   1387 unsigned a(unsigned long long x, unsigned y) {return x % y;}
   1388 
   1389 We currently generate a libcall, but we really shouldn't: the expansion is
   1390 shorter and likely faster than the libcall.  The expected code is something
   1391 like the following:
   1392 
   1393 	movl	12(%ebp), %eax
   1394 	movl	16(%ebp), %ecx
   1395 	xorl	%edx, %edx
   1396 	divl	%ecx
   1397 	movl	8(%ebp), %eax
   1398 	divl	%ecx
   1399 	movl	%edx, %eax
   1400 	ret
   1401 
   1402 A similar code sequence works for division.
   1403 
   1404 //===---------------------------------------------------------------------===//
   1405 
   1406 These should compile to the same code, but the latter codegens to useless
   1407 instructions on X86. This may be a trivial dag combine (GCC PR7061):
   1408 
   1409 struct s1 { unsigned char a, b; };
   1410 unsigned long f1(struct s1 x) {
   1411     return x.a + x.b;
   1412 }
   1413 struct s2 { unsigned a: 8, b: 8; };
   1414 unsigned long f2(struct s2 x) {
   1415     return x.a + x.b;
   1416 }
   1417 
   1418 //===---------------------------------------------------------------------===//
   1419 
   1420 We currently compile this:
   1421 
   1422 define i32 @func1(i32 %v1, i32 %v2) nounwind {
   1423 entry:
   1424   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   1425   %sum = extractvalue {i32, i1} %t, 0
   1426   %obit = extractvalue {i32, i1} %t, 1
   1427   br i1 %obit, label %overflow, label %normal
   1428 normal:
   1429   ret i32 %sum
   1430 overflow:
   1431   call void @llvm.trap()
   1432   unreachable
   1433 }
   1434 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
   1435 declare void @llvm.trap()
   1436 
   1437 to:
   1438 
   1439 _func1:
   1440 	movl	4(%esp), %eax
   1441 	addl	8(%esp), %eax
   1442 	jo	LBB1_2	## overflow
   1443 LBB1_1:	## normal
   1444 	ret
   1445 LBB1_2:	## overflow
   1446 	ud2
   1447 
   1448 it would be nice to produce "into" someday.
   1449 
   1450 //===---------------------------------------------------------------------===//
   1451 
   1452 This code:
   1453 
   1454 void vec_mpys1(int y[], const int x[], int scaler) {
   1455 int i;
   1456 for (i = 0; i < 150; i++)
   1457  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
   1458 }
   1459 
   1460 Compiles to this loop with GCC 3.x:
   1461 
   1462 .L5:
   1463 	movl	%ebx, %eax
   1464 	imull	(%edi,%ecx,4)
   1465 	shrdl	$31, %edx, %eax
   1466 	addl	%eax, (%esi,%ecx,4)
   1467 	incl	%ecx
   1468 	cmpl	$149, %ecx
   1469 	jle	.L5
   1470 
   1471 llvm-gcc compiles it to the much uglier:
   1472 
   1473 LBB1_1:	## bb1
   1474 	movl	24(%esp), %eax
   1475 	movl	(%eax,%edi,4), %ebx
   1476 	movl	%ebx, %ebp
   1477 	imull	%esi, %ebp
   1478 	movl	%ebx, %eax
   1479 	mull	%ecx
   1480 	addl	%ebp, %edx
   1481 	sarl	$31, %ebx
   1482 	imull	%ecx, %ebx
   1483 	addl	%edx, %ebx
   1484 	shldl	$1, %eax, %ebx
   1485 	movl	20(%esp), %eax
   1486 	addl	%ebx, (%eax,%edi,4)
   1487 	incl	%edi
   1488 	cmpl	$150, %edi
   1489 	jne	LBB1_1	## bb1
   1490 
   1491 The issue is that we hoist the cast of "scaler" to long long outside of the
   1492 loop, the value comes into the loop as two values, and
   1493 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
   1494 constructed BUILD_PAIR which represents the cast value.
   1495 
   1496 This can be handled by making CodeGenPrepare sink the cast.
   1497 
   1498 //===---------------------------------------------------------------------===//
   1499 
   1500 Test instructions can be eliminated by using EFLAGS values from arithmetic
   1501 instructions. This is currently not done for mul, and, or, xor, neg, shl,
   1502 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
   1503 for read-modify-write instructions. It is also currently not done if the
   1504 OF or CF flags are needed.
   1505 
   1506 The shift operators have the complication that when the shift count is
   1507 zero, EFLAGS is not set, so they can only subsume a test instruction if
   1508 the shift count is known to be non-zero. Also, using the EFLAGS value
   1509 from a shift is apparently very slow on some x86 implementations.
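
A small, hypothetical example of the shift-count caveat:

int shift_zf(int x, int n) {
  /* Hypothetical testcase: the test against zero can only be subsumed by
     the shift's flags when n is known non-zero, since a zero shift count
     leaves EFLAGS unchanged. */
  return (x >> n) == 0;
}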
   1510 
   1511 In read-modify-write instructions, the root node in the isel match is
   1512 the store, and isel has no way for the use of the EFLAGS result of the
   1513 arithmetic to be remapped to the new node.
   1514 
   1515 Add and subtract instructions set OF on signed overflow and CF on unsigned
   1516 overflow, while test instructions always clear OF and CF. In order to
   1517 replace a test with an add or subtract in a situation where OF or CF is
   1518 needed, codegen must be able to prove that the operation cannot see
   1519 signed or unsigned overflow, respectively.
   1520 
   1521 //===---------------------------------------------------------------------===//
   1522 
   1523 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
   1524 define <16 x float> @foo(<16 x float> %A) nounwind {
   1525 	%tmp = alloca <16 x float>, align 16
   1526 	%tmp2 = alloca <16 x float>, align 16
   1527 	store <16 x float> %A, <16 x float>* %tmp
   1528 	%s = bitcast <16 x float>* %tmp to i8*
   1529 	%s2 = bitcast <16 x float>* %tmp2 to i8*
   1530 	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
   1531 	%R = load <16 x float>* %tmp2
   1532 	ret <16 x float> %R
   1533 }
   1534 
   1535 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
   1536 
   1537 which compiles to:
   1538 
   1539 _foo:
   1540 	subl	$140, %esp
   1541 	movaps	%xmm3, 112(%esp)
   1542 	movaps	%xmm2, 96(%esp)
   1543 	movaps	%xmm1, 80(%esp)
   1544 	movaps	%xmm0, 64(%esp)
   1545 	movl	60(%esp), %eax
   1546 	movl	%eax, 124(%esp)
   1547 	movl	56(%esp), %eax
   1548 	movl	%eax, 120(%esp)
   1549 	movl	52(%esp), %eax
   1550         <many many more 32-bit copies>
   1551       	movaps	(%esp), %xmm0
   1552 	movaps	16(%esp), %xmm1
   1553 	movaps	32(%esp), %xmm2
   1554 	movaps	48(%esp), %xmm3
   1555 	addl	$140, %esp
   1556 	ret
   1557 
   1558 On Nehalem, it may even be cheaper to just use movups when unaligned than to
   1559 fall back to lower-granularity chunks.
   1560 
   1561 //===---------------------------------------------------------------------===//
   1562 
   1563 Implement processor-specific optimizations for parity with GCC on these
   1564 processors.  GCC does two optimizations:
   1565 
   1566 1. ix86_pad_returns inserts a noop before ret instructions if immediately
   1567    preceded by a conditional branch or is the target of a jump.
   1568 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   1569    code contains more than 3 branches.
   1570    
   1571 The first one is done for all AMDs, Core2, and "Generic".
   1572 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
   1573   Core 2, and "Generic".
   1574 
   1575 //===---------------------------------------------------------------------===//
   1576 
   1577 Testcase:
   1578 int a(int x) { return (x & 127) > 31; }
   1579 
   1580 Current output:
   1581 	movl	4(%esp), %eax
   1582 	andl	$127, %eax
   1583 	cmpl	$31, %eax
   1584 	seta	%al
   1585 	movzbl	%al, %eax
   1586 	ret
   1587 
   1588 Ideal output:
   1589 	xorl	%eax, %eax
   1590 	testl	$96, 4(%esp)
   1591 	setne	%al
   1592 	ret
   1593 
   1594 This should definitely be done in instcombine, canonicalizing the range
   1595 condition into a != condition.  We get this IR:
   1596 
   1597 define i32 @a(i32 %x) nounwind readnone {
   1598 entry:
   1599 	%0 = and i32 %x, 127		; <i32> [#uses=1]
   1600 	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
   1601 	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
   1602 	ret i32 %2
   1603 }
   1604 
   1605 Instcombine prefers to strength reduce relational comparisons to equality
   1606 comparisons when possible; this should be another case of that.  This could
   1607 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
   1608 looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
   1609 be redesigned to use ComputeMaskedBits and friends.
   1610 
   1611 
   1612 //===---------------------------------------------------------------------===//
   1613 Testcase:
   1614 int x(int a) { return (a&0xf0)>>4; }
   1615 
   1616 Current output:
   1617 	movl	4(%esp), %eax
   1618 	shrl	$4, %eax
   1619 	andl	$15, %eax
   1620 	ret
   1621 
   1622 Ideal output:
   1623 	movzbl	4(%esp), %eax
   1624 	shrl	$4, %eax
   1625 	ret
   1626 
   1627 //===---------------------------------------------------------------------===//
   1628 
   1629 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
   1630 properly.
   1631 
   1632 When the return value is not used (i.e. we only care about the value in
   1633 memory), x86 does not need an instruction that returns the old value.
   1634 Instead, it can use add, sub, inc, dec instructions with the "lock" prefix.
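
A hypothetical testcase with the result unused; the whole operation could be a
single "lock addl $1, (mem)":

void bump(int *p) {
  __sync_add_and_fetch(p, 1);   /* hypothetical testcase; result ignored */
}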
   1635 
   1636 This is currently implemented using a bit of an instruction selection trick. The
   1637 issue is that the target-independent pattern produces one output and a chain, and we
   1638 want to map it into one that just outputs a chain. The current trick is to select
   1639 it into a MERGE_VALUES with the first definition being an implicit_def. The
   1640 proper solution is to add new ISD opcodes for the no-output variant. DAG
   1641 combiner can then transform the node before it gets to target node selection.
   1642 
   1643 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
   1644 fact these instructions are identical to the non-lock versions. We need a way to
   1645 add target specific information to target nodes and have this information
   1646 carried over to machine instructions. Asm printer (or JIT) can use this
   1647 information to add the "lock" prefix.
   1648 
   1649 //===---------------------------------------------------------------------===//
   1650 
   1651 struct B {
   1652   unsigned char y0 : 1;
   1653 };
   1654 
   1655 int bar(struct B* a) { return a->y0; }
   1656 
   1657 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
   1658   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
   1659   %2 = load i8* %1, align 1
   1660   %3 = and i8 %2, 1
   1661   %4 = zext i8 %3 to i32
   1662   ret i32 %4
   1663 }
   1664 
   1665 bar:                                    # @bar
   1666 # BB#0:
   1667         movb    (%rdi), %al
   1668         andb    $1, %al
   1669         movzbl  %al, %eax
   1670         ret
   1671 
   1672 Missed optimization: should be movl+andl.
   1673 
   1674 //===---------------------------------------------------------------------===//
   1675 
   1676 The x86_64 abi says:
   1677 
   1678 Booleans, when stored in a memory object, are stored as single byte objects the
   1679 value of which is always 0 (false) or 1 (true).
   1680 
   1681 We are not using this fact:
   1682 
   1683 int bar(_Bool *a) { return *a; }
   1684 
   1685 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
   1686   %1 = load i8* %a, align 1, !tbaa !0
   1687   %tmp = and i8 %1, 1
   1688   %2 = zext i8 %tmp to i32
   1689   ret i32 %2
   1690 }
   1691 
   1692 bar:
   1693         movb    (%rdi), %al
   1694         andb    $1, %al
   1695         movzbl  %al, %eax
   1696         ret
   1697 
   1698 GCC produces
   1699 
   1700 bar:
   1701         movzbl  (%rdi), %eax
   1702         ret
   1703 
   1704 //===---------------------------------------------------------------------===//
   1705 
   1706 Consider the following two functions compiled with clang:
   1707 _Bool foo(int *x) { return !(*x & 4); }
   1708 unsigned bar(int *x) { return !(*x & 4); }
   1709 
   1710 foo:
   1711 	movl	4(%esp), %eax
   1712 	testb	$4, (%eax)
   1713 	sete	%al
   1714 	movzbl	%al, %eax
   1715 	ret
   1716 
   1717 bar:
   1718 	movl	4(%esp), %eax
   1719 	movl	(%eax), %eax
   1720 	shrl	$2, %eax
   1721 	andl	$1, %eax
   1722 	xorl	$1, %eax
   1723 	ret
   1724 
   1725 The second function generates more code even though the two functions are
   1726 functionally identical.
   1727 
   1728 //===---------------------------------------------------------------------===//
   1729 
   1730 Take the following C code:
   1731 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
   1732 
   1733 We generate the following IR with clang:
   1734 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1735 entry:
   1736   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
   1737   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
   1738   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
   1739   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1740   ret i32 %conv5
   1741 }
   1742 
   1743 And the following x86 code:
   1744 	xorl	%esi, %edi
   1745 	testb	$-1, %dil
   1746 	sete	%al
   1747 	movzbl	%al, %eax
   1748 	ret
   1749 
   1750 A cmpb instead of the xorl+testb would be one instruction shorter.
   1751 
   1752 //===---------------------------------------------------------------------===//
   1753 
   1754 Given the following C code:
   1755 int f(int a, int b) { return (signed char)a == (signed char)b; }
   1756 
   1757 We generate the following IR with clang:
   1758 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1759 entry:
   1760   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
   1761   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
   1762   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
   1763   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
   1764   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
   1765   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1766   ret i32 %conv5
   1767 }
   1768 
   1769 And the following x86 code:
   1770 	movsbl	%sil, %eax
   1771 	movsbl	%dil, %ecx
   1772 	cmpl	%eax, %ecx
   1773 	sete	%al
   1774 	movzbl	%al, %eax
   1775 	ret
   1776 
   1777 
   1778 It should be possible to eliminate the sign extensions.
   1779 
   1780 //===---------------------------------------------------------------------===//
   1781 
   1782 LLVM misses a load+store narrowing opportunity in this code:
   1783 
   1784 %struct.bf = type { i64, i16, i16, i32 }
   1785 
   1786 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
   1787 
   1788 define void @t1() nounwind ssp {
   1789 entry:
   1790   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1791   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
   1792   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
   1793   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
   1794   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
   1795   store i32 %4, i32* %2, align 1
   1796   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1797   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
   1798   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
   1799   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
   1800   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
   1801   store i32 %9, i32* %7, align 1
   1802   ret void
   1803 }
   1804 
   1805 LLVM currently emits this:
   1806 
   1807   movq  bfi(%rip), %rax
   1808   andl  $-65537, 8(%rax)
   1809   movq  bfi(%rip), %rax
   1810   andl  $-131073, 8(%rax)
   1811   ret
   1812 
   1813 It could narrow the loads and stores to emit this:
   1814 
   1815   movq  bfi(%rip), %rax
   1816   andb  $-2, 10(%rax)
   1817   movq  bfi(%rip), %rax
   1818   andb  $-3, 10(%rax)
   1819   ret
   1820 
   1821 The trouble is that there is a TokenFactor between the store and the
   1822 load, making it non-trivial to determine if there's anything between
   1823 the load and the store which would prohibit narrowing.
   1824 
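The narrowing is sound because -65537 is ~(1<<16) and -131073 is ~(1<<17): each mask
only touches bits that live in byte 2 of the little-endian i32 at offset 8, i.e. the
byte at offset 10.  A standalone check of that equivalence (illustrative sketch;
little-endian layout assumed):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Apply the wide mask to the whole 32-bit word, apply the narrow mask to a single
   byte of its memory image, and check the two images match (little-endian only). */
static void check(uint32_t w, uint32_t wide_mask, int byte_idx, uint8_t narrow_mask) {
  uint8_t wide[4], narrow[4];
  uint32_t masked = w & wide_mask;
  memcpy(wide, &masked, 4);
  memcpy(narrow, &w, 4);
  narrow[byte_idx] &= narrow_mask;
  assert(memcmp(wide, narrow, 4) == 0);
}

int main(void) {
  uint32_t samples[] = { 0, 0xffffffffu, 0x00010000u, 0x00020000u, 0x12345678u };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; ++i) {
    check(samples[i], ~(1u << 16), 2, (uint8_t)~1u);  /* andl $-65537  ->  andb $-2, byte 2 */
    check(samples[i], ~(1u << 17), 2, (uint8_t)~2u);  /* andl $-131073 ->  andb $-3, byte 2 */
  }
  return 0;
}
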
   1825 //===---------------------------------------------------------------------===//
   1826 
   1827 This code:
   1828 void foo(unsigned x) {
   1829   if (x == 0) bar();
   1830   else if (x == 1) qux();
   1831 }
   1832 
   1833 currently compiles into:
   1834 _foo:
   1835 	movl	4(%esp), %eax
   1836 	cmpl	$1, %eax
   1837 	je	LBB0_3
   1838 	testl	%eax, %eax
   1839 	jne	LBB0_4
   1840 
   1841 the testl could be removed:
   1842 _foo:
   1843 	movl	4(%esp), %eax
   1844 	cmpl	$1, %eax
   1845 	je	LBB0_3
   1846 	jb	LBB0_4
   1847 
   1848 0 is the only unsigned number < 1.
   1849 
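In other words, once %eax has been compared against 1, "unsigned below 1" and
"equal to 0" are the same condition, so the jb can reuse the flags from the cmpl.
The identity, spelled out as a check (illustrative sketch):

#include <assert.h>

int main(void) {
  /* For every unsigned x, x == 0 exactly when x < 1. */
  unsigned samples[] = { 0u, 1u, 2u, 3u, 0x7fffffffu, 0xffffffffu };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; ++i)
    assert((samples[i] == 0u) == (samples[i] < 1u));
  return 0;
}
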
   1850 //===---------------------------------------------------------------------===//
   1851 
   1852 This code:
   1853 
   1854 %0 = type { i32, i1 }
   1855 
   1856 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
   1857 entry:
   1858   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
   1859   %cmp = extractvalue %0 %uadd, 1
   1860   %inc = zext i1 %cmp to i32
   1861   %add = add i32 %x, %sum
   1862   %z.0 = add i32 %add, %inc
   1863   ret i32 %z.0
   1864 }
   1865 
   1866 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
   1867 
   1868 compiles to:
   1869 
   1870 _add32carry:                            ## @add32carry
   1871 	addl	%esi, %edi
   1872 	sbbl	%ecx, %ecx
   1873 	movl	%edi, %eax
   1874 	subl	%ecx, %eax
   1875 	ret
   1876 
   1877 But it could be:
   1878 
   1879 _add32carry:
   1880 	leal	(%rsi,%rdi), %eax
   1881 	cmpl	%esi, %eax
   1882 	adcl	$0, %eax
   1883 	ret
   1884 
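The shorter sequence works because the carry out of an unsigned add is visible as
"the result is below either operand"; the cmpl recreates that carry so the adcl can
fold it back in.  A small check of the identity it relies on (illustrative sketch,
helper names made up):

#include <assert.h>

/* carry_wide detects the carry with a 64-bit add; carry_cmp detects it the way the
   leal/cmpl/adcl sequence does, by comparing the 32-bit sum against one operand. */
static unsigned carry_wide(unsigned a, unsigned b) {
  return (unsigned)(((unsigned long long)a + b) >> 32);
}
static unsigned carry_cmp(unsigned a, unsigned b) {
  return (a + b) < a;   /* the sum wrapped iff it is below an operand */
}

int main(void) {
  unsigned samples[] = { 0u, 1u, 2u, 0x7fffffffu, 0x80000000u, 0xfffffffeu, 0xffffffffu };
  unsigned n = sizeof samples / sizeof samples[0];
  for (unsigned i = 0; i < n; ++i)
    for (unsigned j = 0; j < n; ++j)
      assert(carry_wide(samples[i], samples[j]) == carry_cmp(samples[i], samples[j]));
  return 0;
}
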
   1885 //===---------------------------------------------------------------------===//
   1886 
   1887 The hot loop of 256.bzip2 contains code that looks a bit like this:
   1888 
   1889 int foo(char *P, char *Q, int x, int y) {
   1890   if (P[0] != Q[0])
   1891      return P[0] < Q[0];
   1892   if (P[1] != Q[1])
   1893      return P[1] < Q[1];
   1894   if (P[2] != Q[2])
   1895      return P[2] < Q[2];
   1896    return P[3] < Q[3];
   1897 }
   1898 
   1899 In the real code, we get a lot more wrong than this.  However, even in this
   1900 code we generate:
   1901 
   1902 _foo:                                   ## @foo
   1903 ## BB#0:                                ## %entry
   1904 	movb	(%rsi), %al
   1905 	movb	(%rdi), %cl
   1906 	cmpb	%al, %cl
   1907 	je	LBB0_2
   1908 LBB0_1:                                 ## %if.then
   1909 	cmpb	%al, %cl
   1910 	jmp	LBB0_5
   1911 LBB0_2:                                 ## %if.end
   1912 	movb	1(%rsi), %al
   1913 	movb	1(%rdi), %cl
   1914 	cmpb	%al, %cl
   1915 	jne	LBB0_1
   1916 ## BB#3:                                ## %if.end38
   1917 	movb	2(%rsi), %al
   1918 	movb	2(%rdi), %cl
   1919 	cmpb	%al, %cl
   1920 	jne	LBB0_1
   1921 ## BB#4:                                ## %if.end60
   1922 	movb	3(%rdi), %al
   1923 	cmpb	3(%rsi), %al
   1924 LBB0_5:                                 ## %if.end60
   1925 	setl	%al
   1926 	movzbl	%al, %eax
   1927 	ret
   1928 
   1929 Note that we generate jumps to LBB0_1, which does a redundant compare.  The
   1930 redundant compare also forces the register values to be live, which prevents
   1931 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
   1932 
   1933 _foo:
   1934 	movzbl	(%rsi), %eax
   1935 	cmpb	%al, (%rdi)
   1936 	jne	L10
   1937 L12:
   1938 	movzbl	1(%rsi), %eax
   1939 	cmpb	%al, 1(%rdi)
   1940 	jne	L10
   1941 	movzbl	2(%rsi), %eax
   1942 	cmpb	%al, 2(%rdi)
   1943 	jne	L10
   1944 	movzbl	3(%rdi), %eax
   1945 	cmpb	3(%rsi), %al
   1946 L10:
   1947 	setl	%al
   1948 	movzbl	%al, %eax
   1949 	ret
   1950 
   1951 which is "perfect".
   1952 
   1953 //===---------------------------------------------------------------------===//
   1954 
   1955 For the branch in the following code:
   1956 int a();
   1957 int b(int x, int y) {
   1958   if (x & (1<<(y&7)))
   1959     return a();
   1960   return y;
   1961 }
   1962 
   1963 We currently generate:
   1964 	movb	%sil, %al
   1965 	andb	$7, %al
   1966 	movzbl	%al, %eax
   1967 	btl	%eax, %edi
   1968 	jae	.LBB0_2
   1969 
   1970 movl+andl would be shorter than the movb+andb+movzbl sequence.
   1971 
   1972 //===---------------------------------------------------------------------===//
   1973 
   1974 For the following:
   1975 struct u1 {
   1976     float x, y;
   1977 };
   1978 float foo(struct u1 u) {
   1979     return u.x + u.y;
   1980 }
   1981 
   1982 We currently generate:
   1983 	movdqa	%xmm0, %xmm1
   1984 	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
   1985 	addss	%xmm1, %xmm0
   1986 	ret
   1987 
   1988 We could save an instruction here by commuting the addss.
   1989 
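One way to see the intended dataflow is with SSE intrinsics: shuffle u.y down into a
scratch value and add it into the register that already holds u.x, so no copy of
%xmm0 is needed.  This only illustrates the shape of the desired code (the helper is
made up), not a source-level workaround:

#include <stdio.h>
#include <xmmintrin.h>

/* v holds the struct in one XMM register: lane 0 = u.x, lane 1 = u.y. */
static float sum_lanes(__m128 v) {
  __m128 hi = _mm_shuffle_ps(v, v, 1);        /* bring u.y down to lane 0, like pshufd $1 */
  return _mm_cvtss_f32(_mm_add_ss(v, hi));    /* addss into the register holding u.x */
}

int main(void) {
  __m128 v = _mm_setr_ps(1.5f, 2.25f, 0.0f, 0.0f);
  printf("%f\n", sum_lanes(v));               /* prints 3.750000 */
  return 0;
}
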
   1990 //===---------------------------------------------------------------------===//
   1991 
   1992 This (from PR9661):
   1993 
   1994 float clamp_float(float a) {
   1995         if (a > 1.0f)
   1996                 return 1.0f;
   1997         else if (a < 0.0f)
   1998                 return 0.0f;
   1999         else
   2000                 return a;
   2001 }
   2002 
   2003 Could compile to:
   2004 
   2005 clamp_float:                            # @clamp_float
   2006         movss   .LCPI0_0(%rip), %xmm1
   2007         minss   %xmm1, %xmm0
   2008         pxor    %xmm1, %xmm1
   2009         maxss   %xmm1, %xmm0
   2010         ret
   2011 
   2012 with -ffast-math.
   2013 
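The reason -ffast-math is required: for a NaN input the branchy version returns a
itself, while the minss/maxss sequence does not, so the rewrite is only valid once
NaNs are assumed away.  For ordinary inputs the function is just max(0, min(1, a));
a sketch of that equivalence (illustrative only):

#include <assert.h>
#include <math.h>

static float clamp_branchy(float a) {     /* the original from PR9661 */
  if (a > 1.0f) return 1.0f;
  if (a < 0.0f) return 0.0f;
  return a;
}
static float clamp_minmax(float a) {      /* roughly what minss/maxss compute for non-NaN a */
  return fmaxf(0.0f, fminf(1.0f, a));
}

int main(void) {
  float samples[] = { -2.0f, -0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 100.0f };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; ++i)
    assert(clamp_branchy(samples[i]) == clamp_minmax(samples[i]));
  /* NaN is deliberately left out: the two versions differ there, which is exactly
     why the transform needs -ffast-math. */
  return 0;
}
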
   2014 //===---------------------------------------------------------------------===//
   2015 
   2016 This function (from PR9803):
   2017 
   2018 int clamp2(int a) {
   2019         if (a > 5)
   2020                 a = 5;
   2021         if (a < 0) 
   2022                 return 0;
   2023         return a;
   2024 }
   2025 
   2026 Compiles to:
   2027 
   2028 _clamp2:                                ## @clamp2
   2029         pushq   %rbp
   2030         movq    %rsp, %rbp
   2031         cmpl    $5, %edi
   2032         movl    $5, %ecx
   2033         cmovlel %edi, %ecx
   2034         testl   %ecx, %ecx
   2035         movl    $0, %eax
   2036         cmovnsl %ecx, %eax
   2037         popq    %rbp
   2038         ret
   2039 
   2040 The move of 0 could be scheduled above the test so that it can be emitted as an xor reg,reg.
   2041 
   2042 //===---------------------------------------------------------------------===//
   2043 
   2044 GCC PR48986.  We currently compile this:
   2045 
   2046 void bar(void);
   2047 void yyy(int* p) {
   2048     if (__sync_fetch_and_add(p, -1) == 1)
   2049       bar();
   2050 }
   2051 
   2052 into:
   2053 	movl	$-1, %eax
   2054 	lock
   2055 	xaddl	%eax, (%rdi)
   2056 	cmpl	$1, %eax
   2057 	je	LBB0_2
   2058 
   2059 Instead we could generate:
   2060 
   2061 	lock
   2062 	decl	(%rdi)
   2063 	je LBB0_2
   2064 
   2065 The trick is to match "fetch_and_add(X, -C) == C".
   2066 
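The match is sound because fetch_and_add returns the old value: for an add of -C,
"old == C" is the same as "new == 0", and the new value reaching zero is exactly what
the locked decrement's ZF reports.  The same function written against the new value
(illustrative sketch; both builtins are the real __sync forms):

#include <assert.h>

static int hits_zero_old(int *p) {   /* as written in yyy() above */
  return __sync_fetch_and_add(p, -1) == 1;
}
static int hits_zero_new(int *p) {   /* form that maps directly onto lock decl + je */
  return __sync_sub_and_fetch(p, 1) == 0;
}

int main(void) {
  for (int start = -3; start <= 3; ++start) {
    int a = start, b = start;
    assert(hits_zero_old(&a) == hits_zero_new(&b));
    assert(a == b);                  /* both leave the same value behind */
  }
  return 0;
}
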
   2067 //===---------------------------------------------------------------------===//
   2068 
   2069