external/llvm/lib/Target/X86

Name                                Date         Size
Android.mk                          04-Nov-2014  1.6K
AsmParser/                          04-Nov-2014
CMakeLists.txt                      04-Nov-2014  1.8K
Disassembler/                       04-Nov-2014
InstPrinter/                        04-Nov-2014
LLVMBuild.txt                       04-Nov-2014  1K
Makefile                            04-Nov-2014  840
MCTargetDesc/                       04-Nov-2014
README-FPStack.txt                  04-Nov-2014  2.7K
README-MMX.txt                      04-Nov-2014  1.5K
README-SSE.txt                      04-Nov-2014  26.3K
README-UNIMPLEMENTED.txt            04-Nov-2014  679
README-X86-64.txt                   04-Nov-2014  6K
README.txt                          04-Nov-2014  51.6K
TargetInfo/                         04-Nov-2014
Utils/                              04-Nov-2014
X86.h                               04-Nov-2014  3.1K
X86.td                              04-Nov-2014  21.8K
X86AsmPrinter.cpp                   04-Nov-2014  25.6K
X86AsmPrinter.h                     04-Nov-2014  1.7K
X86AtomicExpandPass.cpp             04-Nov-2014  9.8K
X86CallingConv.h                    04-Nov-2014  1.1K
X86CallingConv.td                   04-Nov-2014  25.7K
X86CodeEmitter.cpp                  04-Nov-2014  53.1K
X86CompilationCallback_Win64.asm    04-Nov-2014  1.6K
X86FastISel.cpp                     04-Nov-2014  115.6K
X86FixupLEAs.cpp                    04-Nov-2014  11.9K
X86FloatingPoint.cpp                04-Nov-2014  65.9K
X86FrameLowering.cpp                04-Nov-2014  62.7K
X86FrameLowering.h                  04-Nov-2014  2.8K
X86Instr3DNow.td                    04-Nov-2014  4.4K
X86InstrArithmetic.td               04-Nov-2014  66K
X86InstrAVX512.td                   04-Nov-2014  227.4K
X86InstrBuilder.h                   04-Nov-2014  6.6K
X86InstrCMovSetCC.td                04-Nov-2014  5.3K
X86InstrCompiler.td                 04-Nov-2014  79.1K
X86InstrControl.td                  04-Nov-2014  15.2K
X86InstrExtension.td                04-Nov-2014  8.8K
X86InstrFMA.td                      04-Nov-2014  19.2K
X86InstrFormats.td                  04-Nov-2014  37.9K
X86InstrFPStack.td                  04-Nov-2014  34.3K
X86InstrFragmentsSIMD.td            04-Nov-2014  26.4K
X86InstrInfo.cpp                    04-Nov-2014  226.3K
X86InstrInfo.h                      04-Nov-2014  21.1K
X86InstrInfo.td                     04-Nov-2014  131.3K
X86InstrMMX.td                      04-Nov-2014  28.6K
X86InstrShiftRotate.td              04-Nov-2014  46.1K
X86InstrSSE.td                      04-Nov-2014  430.6K
X86InstrSVM.td                      04-Nov-2014  2.1K
X86InstrSystem.td                   04-Nov-2014  26.7K
X86InstrTSX.td                      04-Nov-2014  1.7K
X86InstrVMX.td                      04-Nov-2014  3.2K
X86InstrXOP.td                      04-Nov-2014  14.4K
X86ISelDAGToDAG.cpp                 04-Nov-2014  102K
X86ISelLowering.cpp                 04-Nov-2014  878.9K
X86ISelLowering.h                   04-Nov-2014  40.7K
X86JITInfo.cpp                      04-Nov-2014  19.3K
X86JITInfo.h                        04-Nov-2014  3K
X86MachineFunctionInfo.cpp          04-Nov-2014  444
X86MachineFunctionInfo.h            04-Nov-2014  5.6K
X86MCInstLower.cpp                  04-Nov-2014  35.3K
X86PadShortFunction.cpp             04-Nov-2014  6.8K
X86RegisterInfo.cpp                 04-Nov-2014  25K
X86RegisterInfo.h                   04-Nov-2014  4.9K
X86RegisterInfo.td                  04-Nov-2014  19.3K
X86Relocations.h                    04-Nov-2014  2K
X86SchedHaswell.td                  04-Nov-2014  8.6K
X86SchedSandyBridge.td              04-Nov-2014  8K
X86Schedule.td                      04-Nov-2014  21.9K
X86ScheduleAtom.td                  04-Nov-2014  28.4K
X86ScheduleSLM.td                   04-Nov-2014  7.4K
X86SelectionDAGInfo.cpp             04-Nov-2014  10.1K
X86SelectionDAGInfo.h               04-Nov-2014  1.6K
X86Subtarget.cpp                    04-Nov-2014  12.2K
X86Subtarget.h                      04-Nov-2014  15.9K
X86TargetMachine.cpp                04-Nov-2014  6K
X86TargetMachine.h                  04-Nov-2014  2.3K
X86TargetObjectFile.cpp             04-Nov-2014  3.9K
X86TargetObjectFile.h               04-Nov-2014  2K
X86TargetTransformInfo.cpp          04-Nov-2014  38.9K
X86VZeroUpper.cpp                   04-Nov-2014  11.5K

README-FPStack.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: FP stack related stuff
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
      8 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
      9 
     10 //===---------------------------------------------------------------------===//
     11 
     12 This should use fiadd on chips where it is profitable:
     13 double foo(double P, int *I) { return P+*I; }
     14 
     15 We have fiadd patterns now, but the following patterns have the same cost and
     16 complexity.  We need a way to specify that the latter is more profitable.
     17 
     18 def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
     19                     [(set RFP:$dst, (fadd RFP:$src1,
     20                                      (extloadf64f32 addr:$src2)))]>;
     21                 // ST(0) = ST(0) + [mem32]
     22 
     23 def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
     24                     [(set RFP:$dst, (fadd RFP:$src1,
     25                                      (X86fild addr:$src2, i32)))]>;
     26                 // ST(0) = ST(0) + [mem32int]
     27 
     28 //===---------------------------------------------------------------------===//
     29 
     30 The FP stackifier should handle simple permutations to reduce the number of shuffle
     31 instructions, e.g. turning:
     32 
     33 fld P	->		fld Q
     34 fld Q			fld P
     35 fxch
     36 
     37 or:
     38 
     39 fxch	->		fucomi
     40 fucomi			jl X
     41 jg X
     42 
     43 Ideas:
     44 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
     45 
     46 
     47 //===---------------------------------------------------------------------===//
     48 
     49 Add a target-specific hook to the DAG combiner to handle SINT_TO_FP and
     50 FP_TO_SINT when the source operand is already in memory.
     51 
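As a reminder of the pattern in question, here is a minimal hypothetical C example
where the integer source already lives in memory, so fildl/fistpl could be used
directly instead of bouncing the value through a GPR (a sketch, not backend code):

/* Hypothetical example: the int is already in memory, so SINT_TO_FP could use
   fildl (mem) directly rather than a GPR load plus a store/fild round trip. */
double int_to_double(const int *p) { return (double)*p; }

/* The FP_TO_SINT direction with a memory source/destination is analogous. */
void double_to_int(const double *p, int *out) { *out = (int)*p; }
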
     52 //===---------------------------------------------------------------------===//
     53 
     54 Open-code rint, floor, ceil, trunc:
     55 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
     56 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
     57 
     58 Open-code the sincos[f] libcall.
     59 
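For reference, the kind of source that produces the sincos[f] libcall looks like
this (a sketch; sincos is a GNU extension declared in math.h under _GNU_SOURCE):

#define _GNU_SOURCE
#include <math.h>

/* Both the sine and cosine of the same angle are needed, which is exactly
   what the sincos libcall (and the x87 fsincos instruction) computes. */
void polar_to_xy(double r, double theta, double *x, double *y) {
  double s, c;
  sincos(theta, &s, &c);
  *x = r * c;
  *y = r * s;
}
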
     60 //===---------------------------------------------------------------------===//
     61 
     62 None of the FPStack instructions are handled in
     63 X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
     64 folding spill code into the instructions.
     65 
     66 //===---------------------------------------------------------------------===//
     67 
     68 Currently the x86 codegen isn't very good at mixing SSE and FPStack
     69 code:
     70 
     71 unsigned int foo(double x) { return x; }
     72 
     73 foo:
     74 	subl $20, %esp
     75 	movsd 24(%esp), %xmm0
     76 	movsd %xmm0, 8(%esp)
     77 	fldl 8(%esp)
     78 	fisttpll (%esp)
     79 	movl (%esp), %eax
     80 	addl $20, %esp
     81 	ret
     82 
     83 This just requires being smarter when custom expanding fptoui.
     84 
     85 //===---------------------------------------------------------------------===//
     86 

README-MMX.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: MMX-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 This:
      8 
      9 #include <mmintrin.h>
     10 
     11 __v2si qux(int A) {
     12   return (__v2si){ 0, A };
     13 }
     14 
     15 is compiled into:
     16 
     17 _qux:
     18         subl $28, %esp
     19         movl 32(%esp), %eax
     20         movd %eax, %mm0
     21         movq %mm0, (%esp)
     22         movl (%esp), %eax
     23         movl %eax, 20(%esp)
     24         movq %mm0, 8(%esp)
     25         movl 12(%esp), %eax
     26         movl %eax, 16(%esp)
     27         movq 16(%esp), %mm0
     28         addl $28, %esp
     29         ret
     30 
     31 Yuck!
     32 
     33 GCC gives us:
     34 
     35 _qux:
     36         subl    $12, %esp
     37         movl    16(%esp), %eax
     38         movl    20(%esp), %edx
     39         movl    $0, (%eax)
     40         movl    %edx, 4(%eax)
     41         addl    $12, %esp
     42         ret     $4
     43 
     44 //===---------------------------------------------------------------------===//
     45 
     46 We generate crappy code for this:
     47 
     48 __m64 t() {
     49   return _mm_cvtsi32_si64(1);
     50 }
     51 
     52 _t:
     53 	subl	$12, %esp
     54 	movl	$1, %eax
     55 	movd	%eax, %mm0
     56 	movq	%mm0, (%esp)
     57 	movl	(%esp), %eax
     58 	movl	4(%esp), %edx
     59 	addl	$12, %esp
     60 	ret
     61 
     62 The extra stack traffic is covered in the previous entry. But the other reason
     63 is that we are not smart about materializing constants in MMX registers. With -m64 we get:
     64 
     65 	movl	$1, %eax
     66 	movd	%eax, %mm0
     67 	movd	%mm0, %rax
     68 	ret
     69 
     70 We should be using a constantpool load instead:
     71 	movq	LC0(%rip), %rax
     72 

README-SSE.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: SSE-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 SSE Variable shift can be custom lowered to something like this, which uses a
      8 small table + unaligned load + shuffle instead of going through memory.
      9 
     10 __m128i_shift_right:
     11 	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     12 	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
     13 
     14 ...
     15 __m128i shift_right(__m128i value, unsigned long offset) {
     16   return _mm_shuffle_epi8(value,
     17                _mm_loadu_si128((const __m128i *) (___m128i_shift_right + offset)));
     18 }
     19 
     20 //===---------------------------------------------------------------------===//
     21 
     22 SSE has instructions for doing operations on complex numbers; we should pattern
     23 match them.  For example, this should turn into a horizontal add:
     24 
     25 typedef float __attribute__((vector_size(16))) v4f32;
     26 float f32(v4f32 A) {
     27   return A[0]+A[1]+A[2]+A[3];
     28 }
     29 
     30 Instead we get this:
     31 
     32 _f32:                                   ## @f32
     33 	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
     34 	addss	%xmm0, %xmm1
     35 	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
     36 	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
     37 	movaps	%xmm0, %xmm3
     38 	addss	%xmm1, %xmm3
     39 	movdqa	%xmm2, %xmm0
     40 	addss	%xmm3, %xmm0
     41 	ret
     42 
     43 Also, there are cases where some simple local SLP would improve codegen a bit.
     44 Compiling this:
     45 
     46 _Complex float f32(_Complex float A, _Complex float B) {
     47   return A+B;
     48 }
     49 
     50 into:
     51 
     52 _f32:                                   ## @f32
     53 	movdqa	%xmm0, %xmm2
     54 	addss	%xmm1, %xmm2
     55 	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
     56 	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
     57 	addss	%xmm1, %xmm3
     58 	movaps	%xmm2, %xmm0
     59 	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
     60 	ret
     61 
     62 seems silly when it could just be one addps.
     63 
     64 
     65 //===---------------------------------------------------------------------===//
     66 
     67 Expand libm rounding functions inline:  Significant speedups possible.
     68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
     69 
     70 //===---------------------------------------------------------------------===//
     71 
     72 When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
     73 other fast SSE modes.
     74 
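A minimal sketch of what that could mean, using the standard MXCSR intrinsics
(this is not what the backend emits today; the bit values are the architectural
FTZ/DAZ bits):

#include <xmmintrin.h>   /* _mm_getcsr / _mm_setcsr */

/* Sketch: under unsafe math, "main" could start by setting flush-to-zero and
   denormals-are-zero in MXCSR so denormals don't trigger slow assists. */
static void enable_fast_sse_modes(void) {
  unsigned csr = _mm_getcsr();
  csr |= 0x8000;   /* FTZ: flush denormal results to zero */
  csr |= 0x0040;   /* DAZ: treat denormal inputs as zero */
  _mm_setcsr(csr);
}
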
     75 //===---------------------------------------------------------------------===//
     76 
     77 Think about doing i64 math in SSE regs on x86-32.
     78 
     79 //===---------------------------------------------------------------------===//
     80 
     81 This testcase should have no SSE instructions in it, and only one load from
     82 a constant pool:
     83 
     84 double %test3(bool %B) {
     85         %C = select bool %B, double 123.412, double 523.01123123
     86         ret double %C
     87 }
     88 
     89 Currently, the select is being lowered, which prevents the dag combiner from
     90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.
     91 
     92 The pattern isel got this one right.
     93 
     94 //===---------------------------------------------------------------------===//
     95 
     96 SSE should implement 'select_cc' using 'emulated conditional moves' that use
     97 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
     98 
     99 double %X(double %Y, double %Z, double %A, double %B) {
    100         %C = setlt double %A, %B
    101         %z = fadd double %Z, 0.0    ;; select operand is not a load
    102         %D = select bool %C, double %Y, double %z
    103         ret double %D
    104 }
    105 
    106 We currently emit:
    107 
    108 _X:
    109         subl $12, %esp
    110         xorpd %xmm0, %xmm0
    111         addsd 24(%esp), %xmm0
    112         movsd 32(%esp), %xmm1
    113         movsd 16(%esp), %xmm2
    114         ucomisd 40(%esp), %xmm1
    115         jb LBB_X_2
    116 LBB_X_1:
    117         movsd %xmm0, %xmm2
    118 LBB_X_2:
    119         movsd %xmm2, (%esp)
    120         fldl (%esp)
    121         addl $12, %esp
    122         ret
    123 
    124 //===---------------------------------------------------------------------===//
    125 
    126 Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
    127 feasible.
    128 
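A sketch of the idea for a small fixed-size copy (hypothetical helper, not what
the lowering emits today):

#include <emmintrin.h>
#include <stddef.h>

/* Copy 64 bytes with four 128-bit moves instead of a memcpy libcall or
   rep movsl.  Unaligned loads/stores keep it valid for any pointers. */
static void copy64(void *dst, const void *src) {
  const __m128i *s = (const __m128i *)src;
  __m128i *d = (__m128i *)dst;
  for (size_t i = 0; i < 4; ++i)
    _mm_storeu_si128(d + i, _mm_loadu_si128(s + i));
}
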
    129 //===---------------------------------------------------------------------===//
    130 
    131 Codegen:
    132   if (copysign(1.0, x) == copysign(1.0, y))
    133 into:
    134   if (x^y & mask)
    135 when using SSE.
    136 
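A scalar sketch of the intended transform (function name hypothetical): the
comparison reduces to testing the sign bit of x^y:

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) holds exactly when x and y have the
   same sign bit, i.e. when the sign bit of x^y (as integers) is clear. */
int same_sign(double x, double y) {
  uint64_t xi, yi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) & 0x8000000000000000ULL) == 0;
}
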
    137 //===---------------------------------------------------------------------===//
    138 
    139 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
    140 of a v4sf value.
    141 
    142 //===---------------------------------------------------------------------===//
    143 
    144 Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
    145 Perhaps use pxor / xorp* to clear an XMM register first?
    146 
    147 //===---------------------------------------------------------------------===//
    148 
    149 External test Nurbs exposed some problems. Look for
    150 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
    151 emits:
    152 
    153         movaps    (%edx), %xmm2                                 #59.21
    154         movaps    (%edx), %xmm5                                 #60.21
    155         movaps    (%edx), %xmm4                                 #61.21
    156         movaps    (%edx), %xmm3                                 #62.21
    157         movl      40(%ecx), %ebp                                #69.49
    158         shufps    $0, %xmm2, %xmm5                              #60.21
    159         movl      100(%esp), %ebx                               #69.20
    160         movl      (%ebx), %edi                                  #69.20
    161         imull     %ebp, %edi                                    #69.49
    162         addl      (%eax), %edi                                  #70.33
    163         shufps    $85, %xmm2, %xmm4                             #61.21
    164         shufps    $170, %xmm2, %xmm3                            #62.21
    165         shufps    $255, %xmm2, %xmm2                            #63.21
    166         lea       (%ebp,%ebp,2), %ebx                           #69.49
    167         negl      %ebx                                          #69.49
    168         lea       -3(%edi,%ebx), %ebx                           #70.33
    169         shll      $4, %ebx                                      #68.37
    170         addl      32(%ecx), %ebx                                #68.37
    171         testb     $15, %bl                                      #91.13
    172         jne       L_B1.24       # Prob 5%                       #91.13
    173 
    174 This is the llvm code after instruction scheduling:
    175 
    176 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    177 	%reg1078 = MOV32ri -3
    178 	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
    179 	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
    180 	%reg1080 = IMUL32rr %reg1079, %reg1037
    181 	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
    182 	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    183 	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
    184 	%reg1082 = SHL32ri %reg1038, 4
    185 	%reg1039 = ADD32rr %reg1036, %reg1082
    186 	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
    187 	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    188 	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    189 	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    190 	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    191 	%reg1040 = MOV32rr %reg1039
    192 	%reg1084 = AND32ri8 %reg1039, 15
    193 	CMP32ri8 %reg1084, 0
    194 	JE mbb<cond_next204,0xa914d30>
    195 
    196 Still ok. After register allocation:
    197 
    198 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    199 	%EAX = MOV32ri -3
    200 	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
    201 	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
    202 	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
    203 	%EDX = MOV32rm %EDX, 1, %NOREG, 40
    204 	IMUL32rr %EAX<def&use>, %EDX
    205 	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
    206 	%ESI = MOV32rm %ESI, 1, %NOREG, 0
    207 	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
    208 	%EAX = LEA32r %ESI, 1, %EAX, -3
    209 	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
    210 	%ESI = MOV32rm %ESI, 1, %NOREG, 32
    211 	%EDI = MOV32rr %EAX
    212 	SHL32ri %EDI<def&use>, 4
    213 	ADD32rr %EDI<def&use>, %ESI
    214 	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
    215 	%XMM1 = MOVAPSrr %XMM0
    216 	SHUFPSrr %XMM1<def&use>, %XMM1, 170
    217 	%XMM2 = MOVAPSrr %XMM0
    218 	SHUFPSrr %XMM2<def&use>, %XMM2, 0
    219 	%XMM3 = MOVAPSrr %XMM0
    220 	SHUFPSrr %XMM3<def&use>, %XMM3, 255
    221 	SHUFPSrr %XMM0<def&use>, %XMM0, 85
    222 	%EBX = MOV32rr %EDI
    223 	AND32ri8 %EBX<def&use>, 15
    224 	CMP32ri8 %EBX, 0
    225 	JE mbb<cond_next204,0xa914d30>
    226 
    227 This looks really bad. The problem is that shufps is a destructive opcode:
    228 because the same register appears as operand two of more than one shufps, a
    229 number of copies are needed. Note that icc suffers from the same problem.
    230 Either the instruction selector should select pshufd, or the register allocator
    231 should perform the two-address to three-address transformation.
    232 
    233 It also exposes some other problems. See MOV32ri -3 and the spills.
    234 
    235 //===---------------------------------------------------------------------===//
    236 
    237 Consider:
    238 
    239 __m128 test(float a) {
    240   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
    241 }
    242 
    243 This compiles into:
    244 
    245 movss 4(%esp), %xmm1
    246 mulss %xmm1, %xmm1
    247 xorps %xmm0, %xmm0
    248 movss %xmm1, %xmm0
    249 ret
    250 
    251 Because the movss load zeros the top 3 elements and mulss doesn't modify them,
    252 the top elements of xmm1 are already zeroed.  We could compile this to:
    253 
    254 movss 4(%esp), %xmm0
    255 mulss %xmm0, %xmm0
    256 ret
    257 
    258 //===---------------------------------------------------------------------===//
    259 
    260 Here's a sick and twisted idea.  Consider code like this:
    261 
    262 __m128 test(__m128 a) {
    263   float b = *(float*)&a;
    264   ...
    265   return _mm_set_ps(0.0, 0.0, 0.0, b);
    266 }
    267 
    268 This might compile to this code:
    269 
    270 movaps c(%esp), %xmm1
    271 xorps %xmm0, %xmm0
    272 movss %xmm1, %xmm0
    273 ret
    274 
    275 Now consider if the ... code caused xmm1 to get spilled.  This might produce
    276 this code:
    277 
    278 movaps c(%esp), %xmm1
    279 movaps %xmm1, c2(%esp)
    280 ...
    281 
    282 xorps %xmm0, %xmm0
    283 movaps c2(%esp), %xmm1
    284 movss %xmm1, %xmm0
    285 ret
    286 
    287 However, since the reload is only used by these instructions, we could 
    288 "fold" it into the uses, producing something like this:
    289 
    290 movaps c(%esp), %xmm1
    291 movaps %xmm1, c2(%esp)
    292 ...
    293 
    294 movss c2(%esp), %xmm0
    295 ret
    296 
    297 ... saving two instructions.
    298 
    299 The basic idea is that, if only one 4-byte chunk of the spill slot is used, a
    300 reload can bring in the one element plus three zeros instead of all four
    301 elements.  This can be used to simplify a variety of shuffle operations where
    302 the other elements are known zeros.
    303 
    304 //===---------------------------------------------------------------------===//
    305 
    306 This code generates ugly code, probably due to costs being off or something:
    307 
    308 define void @test(float* %P, <4 x float>* %P2 ) {
    309         %xFloat0.688 = load float* %P
    310         %tmp = load <4 x float>* %P2
    311         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
    312         store <4 x float> %inFloat3.713, <4 x float>* %P2
    313         ret void
    314 }
    315 
    316 Generates:
    317 
    318 _test:
    319 	movl	8(%esp), %eax
    320 	movaps	(%eax), %xmm0
    321 	pxor	%xmm1, %xmm1
    322 	movaps	%xmm0, %xmm2
    323 	shufps	$50, %xmm1, %xmm2
    324 	shufps	$132, %xmm2, %xmm0
    325 	movaps	%xmm0, (%eax)
    326 	ret
    327 
    328 Would it be better to generate:
    329 
    330 _test:
    331         movl 8(%esp), %ecx
    332         movaps (%ecx), %xmm0
    333 	xor %eax, %eax
    334         pinsrw $6, %eax, %xmm0
    335         pinsrw $7, %eax, %xmm0
    336         movaps %xmm0, (%ecx)
    337         ret
    338 
    339 ?
    340 
    341 //===---------------------------------------------------------------------===//
    342 
    343 Some useful information in the Apple Altivec / SSE Migration Guide:
    344 
    345 http://developer.apple.com/documentation/Performance/Conceptual/Accelerate_sse_migration/index.html
    347 
    348 e.g. SSE select using and, andnot, or. Various SSE compare translations.
    349 
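For reference, the and/andnot/or select idiom mentioned above looks like this in
intrinsics (a sketch; helper names are made up):

#include <xmmintrin.h>

/* Given a compare result (all-ones or all-zeros per lane), blend a and b
   without a branch: (mask & a) | (~mask & b). */
static __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}

/* Example use: per-lane "x > y ? x : y". */
static __m128 max_via_select(__m128 x, __m128 y) {
  return select_ps(_mm_cmpgt_ps(x, y), x, y);
}
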
    350 //===---------------------------------------------------------------------===//
    351 
    352 Add hooks to commute some CMPP operations.
    353 
    354 //===---------------------------------------------------------------------===//
    355 
    356 Apply the same transformation that merged four float into a single 128-bit load
    357 to loads from constant pool.
    358 
    359 //===---------------------------------------------------------------------===//
    360 
    361 Floating point max / min are commutable when -enable-unsafe-fp-path is
    362 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
    363 nodes which are selected to max / min instructions that are marked commutable.
    364 
    365 //===---------------------------------------------------------------------===//
    366 
    367 We should materialize vector constants like "all ones" and "signbit" with 
    368 code like:
    369 
    370      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    371 
    372 and:
    373      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    374      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
    375 
    376 instead of using a load from the constant pool.  The latter is important for
    377 ABS/NEG/copysign etc.
    378 
    379 //===---------------------------------------------------------------------===//
    380 
    381 These functions:
    382 
    383 #include <xmmintrin.h>
    384 __m128i a;
    385 void x(unsigned short n) {
    386   a = _mm_slli_epi32 (a, n);
    387 }
    388 void y(unsigned n) {
    389   a = _mm_slli_epi32 (a, n);
    390 }
    391 
    392 compile to ( -O3 -static -fomit-frame-pointer):
    393 _x:
    394         movzwl  4(%esp), %eax
    395         movd    %eax, %xmm0
    396         movaps  _a, %xmm1
    397         pslld   %xmm0, %xmm1
    398         movaps  %xmm1, _a
    399         ret
    400 _y:
    401         movd    4(%esp), %xmm0
    402         movaps  _a, %xmm1
    403         pslld   %xmm0, %xmm1
    404         movaps  %xmm1, _a
    405         ret
    406 
    407 "y" looks good, but "x" does silly movzwl stuff through a GPR.  It seems
    408 like movd would be sufficient in both cases, as the value is already zero
    409 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
    410 safe, as a truly negative value would be undefined for pslld.
    411 
    412 
    413 //===---------------------------------------------------------------------===//
    414 
    415 #include <math.h>
    416 int t1(double d) { return signbit(d); }
    417 
    418 This currently compiles to:
    419 	subl	$12, %esp
    420 	movsd	16(%esp), %xmm0
    421 	movsd	%xmm0, (%esp)
    422 	movl	4(%esp), %eax
    423 	shrl	$31, %eax
    424 	addl	$12, %esp
    425 	ret
    426 
    427 We should use movmskp{s|d} instead.
    428 
    429 //===---------------------------------------------------------------------===//
    430 
    431 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
    432 (aligned) vector load.  This functionality has a couple of problems.
    433 
    434 1. The code to infer alignment from loads of globals is in the X86 backend,
    435    not the dag combiner.  This is because dagcombine2 needs to be able to see
    436    through the X86ISD::Wrapper node, which DAGCombine can't really do.
    437 2. The code for turning 4 x load into a single vector load is target 
    438    independent and should be moved to the dag combiner.
    439 3. The code for turning 4 x load into a vector load can only handle a direct 
    440    load from a global or a direct load from the stack.  It should be generalized
    441    to handle any load from P, P+4, P+8, P+12, where P can be anything.
    442 4. The alignment inference code cannot handle loads from globals in non-static
    443    mode because it doesn't look through the extra dyld stub load.  If you try
    444    vec_align.ll without -relocation-model=static, you'll see what I mean.
    445 
    446 //===---------------------------------------------------------------------===//
    447 
    448 We should lower store(fneg(load p), q) into an integer load+xor+store, which
    449 eliminates a constant pool load.  For example, consider:
    450 
    451 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
    452 entry:
    453  %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
    454  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
    455  ret i64 %tmp20
    456 }
    457 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
    458 
    459 This currently compiles to:
    460 
    461 LCPI1_0:					#  <4 x float>
    462 	.long	2147483648	# float -0
    463 	.long	2147483648	# float -0
    464 	.long	2147483648	# float -0
    465 	.long	2147483648	# float -0
    466 _ccosf:
    467 	subl	$12, %esp
    468 	movss	16(%esp), %xmm0
    469 	movss	%xmm0, 4(%esp)
    470 	movss	20(%esp), %xmm0
    471 	xorps	LCPI1_0, %xmm0
    472 	movss	%xmm0, (%esp)
    473 	call	L_ccoshf$stub
    474 	addl	$12, %esp
    475 	ret
    476 
    477 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
    478 this code computes the pic base and does two loads to do the constant pool 
    479 load, so the improvement is much bigger.
    480 
    481 The tricky part about this xform is that the argument load/store isn't exposed
    482 until post-legalize, and at that point, the fneg has been custom expanded into 
    483 an X86 fxor.  This means that we need to handle this case in the x86 backend
    484 instead of in target independent code.
    485 
    486 //===---------------------------------------------------------------------===//
    487 
    488 Non-SSE4 insert into 16 x i8 is atrociously bad.
    489 
    490 //===---------------------------------------------------------------------===//
    491 
    492 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
    493 is memory.
    494 
    495 //===---------------------------------------------------------------------===//
    496 
    497 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
    498 any number of 0.0 simultaneously.  Currently we only use it for simple
    499 insertions.
    500 
    501 See comments in LowerINSERT_VECTOR_ELT_SSE4.
    502 
    503 //===---------------------------------------------------------------------===//
    504 
    505 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
    506 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
    507 legal; it'll just take a few extra patterns written in the .td file.
    508 
    509 Note: this is not a code quality issue; the custom lowered code happens to be
    510 right, but we shouldn't have to custom lower anything.  This is probably related
    511 to <2 x i64> ops being so bad.
    512 
    513 //===---------------------------------------------------------------------===//
    514 
    515 LLVM currently generates stack realignment code when it is not actually
    516 needed. The problem is that we need to know about stack alignment too early,
    517 before RA runs.
    518 
    519 At that point we don't know whether there will be vector spills or not.
    520 The stack realignment logic is overly conservative here, but otherwise we could
    521 end up producing unaligned loads/stores.
    522 
    523 Fixing this will require some huge RA changes.
    524 
    525 Testcase:
    526 #include <emmintrin.h>
    527 
    528 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
    529 
    530 static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
    531                           -22725, -12873};
    532 
    533 vSInt16 madd(vSInt16 b)
    534 {
    535     return _mm_madd_epi16(a, b);
    536 }
    537 
    538 Generated code (x86-32, linux):
    539 madd:
    540         pushl   %ebp
    541         movl    %esp, %ebp
    542         andl    $-16, %esp
    543         movaps  .LCPI1_0, %xmm1
    544         pmaddwd %xmm1, %xmm0
    545         movl    %ebp, %esp
    546         popl    %ebp
    547         ret
    548 
    549 //===---------------------------------------------------------------------===//
    550 
    551 Consider:
    552 #include <emmintrin.h> 
    553 __m128 foo2 (float x) {
    554  return _mm_set_ps (0, 0, x, 0);
    555 }
    556 
    557 In x86-32 mode, we generate this spiffy code:
    558 
    559 _foo2:
    560 	movss	4(%esp), %xmm0
    561 	pshufd	$81, %xmm0, %xmm0
    562 	ret
    563 
    564 In x86-64 mode, we generate this code, which could be better:
    565 
    566 _foo2:
    567 	xorps	%xmm1, %xmm1
    568 	movss	%xmm0, %xmm1
    569 	pshufd	$81, %xmm1, %xmm0
    570 	ret
    571 
    572 In sse4 mode, we could use insertps to make both better.
    573 
    574 Here's another testcase that could use insertps [mem]:
    575 
    576 #include <xmmintrin.h>
    577 extern float x2, x3;
    578 __m128 foo1 (float x1, float x4) {
    579  return _mm_set_ps (x2, x1, x3, x4);
    580 }
    581 
    582 gcc mainline compiles it to:
    583 
    584 foo1:
    585        insertps        $0x10, x2(%rip), %xmm0
    586        insertps        $0x10, x3(%rip), %xmm1
    587        movaps  %xmm1, %xmm2
    588        movlhps %xmm0, %xmm2
    589        movaps  %xmm2, %xmm0
    590        ret
    591 
    592 //===---------------------------------------------------------------------===//
    593 
    594 We compile vector multiply-by-constant into poor code:
    595 
    596 define <4 x i32> @f(<4 x i32> %i) nounwind  {
    597 	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
    598 	ret <4 x i32> %A
    599 }
    600 
    601 On targets without SSE4.1, this compiles into:
    602 
    603 LCPI1_0:					##  <4 x i32>
    604 	.long	10
    605 	.long	10
    606 	.long	10
    607 	.long	10
    608 	.text
    609 	.align	4,0x90
    610 	.globl	_f
    611 _f:
    612 	pshufd	$3, %xmm0, %xmm1
    613 	movd	%xmm1, %eax
    614 	imull	LCPI1_0+12, %eax
    615 	movd	%eax, %xmm1
    616 	pshufd	$1, %xmm0, %xmm2
    617 	movd	%xmm2, %eax
    618 	imull	LCPI1_0+4, %eax
    619 	movd	%eax, %xmm2
    620 	punpckldq	%xmm1, %xmm2
    621 	movd	%xmm0, %eax
    622 	imull	LCPI1_0, %eax
    623 	movd	%eax, %xmm1
    624 	movhlps	%xmm0, %xmm0
    625 	movd	%xmm0, %eax
    626 	imull	LCPI1_0+8, %eax
    627 	movd	%eax, %xmm0
    628 	punpckldq	%xmm0, %xmm1
    629 	movaps	%xmm1, %xmm0
    630 	punpckldq	%xmm2, %xmm0
    631 	ret
    632 
    633 It would be better to synthesize integer vector multiplication by constants
    634 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
    635 simple cases such as multiplication by powers of two would be better as
    636 vector shifts than as multiplications.
    637 
    638 //===---------------------------------------------------------------------===//
    639 
    640 We compile this:
    641 
    642 __m128i
    643 foo2 (char x)
    644 {
    645   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
    646 }
    647 
    648 into:
    649 	movl	$1, %eax
    650 	xorps	%xmm0, %xmm0
    651 	pinsrw	$2, %eax, %xmm0
    652 	movzbl	4(%esp), %eax
    653 	pinsrw	$3, %eax, %xmm0
    654 	movl	$256, %eax
    655 	pinsrw	$7, %eax, %xmm0
    656 	ret
    657 
    658 
    659 gcc-4.2:
    660 	subl	$12, %esp
    661 	movzbl	16(%esp), %eax
    662 	movdqa	LC0, %xmm0
    663 	pinsrw	$3, %eax, %xmm0
    664 	addl	$12, %esp
    665 	ret
    666 	.const
    667 	.align 4
    668 LC0:
    669 	.word	0
    670 	.word	0
    671 	.word	1
    672 	.word	0
    673 	.word	0
    674 	.word	0
    675 	.word	0
    676 	.word	256
    677 
    678 With SSE4, it should be
    679       movdqa  .LC0(%rip), %xmm0
    680       pinsrb  $6, %edi, %xmm0
    681 
    682 //===---------------------------------------------------------------------===//
    683 
    684 We should transform a shuffle of two vectors of constants into a single vector
    685 of constants. Similarly, insertelement of a constant into a vector of constants
    686 should result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.
    687 
    688 We compiled it to something horrible:
    689 
    690 	.align	4
    691 LCPI1_1:					##  float
    692 	.long	1065353216	## float 1
    693 	.const
    694 
    695 	.align	4
    696 LCPI1_0:					##  <4 x float>
    697 	.space	4
    698 	.long	1065353216	## float 1
    699 	.space	4
    700 	.long	1065353216	## float 1
    701 	.text
    702 	.align	4,0x90
    703 	.globl	_t
    704 _t:
    705 	xorps	%xmm0, %xmm0
    706 	movhps	LCPI1_0, %xmm0
    707 	movss	LCPI1_1, %xmm1
    708 	movaps	%xmm0, %xmm2
    709 	shufps	$2, %xmm1, %xmm2
    710 	shufps	$132, %xmm2, %xmm0
    711 	movaps	%xmm0, 0
    712 
    713 //===---------------------------------------------------------------------===//
    714 rdar://5907648
    715 
    716 This function:
    717 
    718 float foo(unsigned char x) {
    719   return x;
    720 }
    721 
    722 compiles to (x86-32):
    723 
    724 define float @foo(i8 zeroext  %x) nounwind  {
    725 	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
    726 	ret float %tmp12
    727 }
    728 
    729 compiles to:
    730 
    731 _foo:
    732 	subl	$4, %esp
    733 	movzbl	8(%esp), %eax
    734 	cvtsi2ss	%eax, %xmm0
    735 	movss	%xmm0, (%esp)
    736 	flds	(%esp)
    737 	addl	$4, %esp
    738 	ret
    739 
    740 We should be able to use:
    741   cvtsi2ss 8(%esp), %xmm0
    742 since we know the stack slot is already zext'd.
    743 
    744 //===---------------------------------------------------------------------===//
    745 
    746 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
    747 when code size is critical. movlps is slower than movsd on core2 but it's one
    748 byte shorter.
    749 
    750 //===---------------------------------------------------------------------===//
    751 
    752 We should use a dynamic programming based approach to tell when using FPStack
    753 operations is cheaper than SSE.  SciMark montecarlo contains code like this
    754 for example:
    755 
    756 double MonteCarlo_num_flops(int Num_samples) {
    757     return ((double) Num_samples)* 4.0;
    758 }
    759 
    760 In fpstack mode, this compiles into:
    761 
    762 LCPI1_0:					
    763 	.long	1082130432	## float 4.000000e+00
    764 _MonteCarlo_num_flops:
    765 	subl	$4, %esp
    766 	movl	8(%esp), %eax
    767 	movl	%eax, (%esp)
    768 	fildl	(%esp)
    769 	fmuls	LCPI1_0
    770 	addl	$4, %esp
    771 	ret
    772         
    773 in SSE mode, it compiles into significantly slower code:
    774 
    775 _MonteCarlo_num_flops:
    776 	subl	$12, %esp
    777 	cvtsi2sd	16(%esp), %xmm0
    778 	mulsd	LCPI1_0, %xmm0
    779 	movsd	%xmm0, (%esp)
    780 	fldl	(%esp)
    781 	addl	$12, %esp
    782 	ret
    783 
    784 There are also other cases in scimark where using fpstack is better; it is
    785 cheaper to do fld1 than to load from a constant pool, for example, so
    786 "load, add 1.0, store" is better done in the fp stack, etc.
    787 
    788 //===---------------------------------------------------------------------===//
    789 
    790 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
    791 "cmpsd".  For example, this code:
    792 
    793 double d1(double x) { return x == x ? x : x + x; }
    794 
    795 Compiles into:
    796 
    797 _d1:
    798 	ucomisd	%xmm0, %xmm0
    799 	jnp	LBB1_2
    800 	addsd	%xmm0, %xmm0
    801 	ret
    802 LBB1_2:
    803 	ret
    804 
    805 Also, the 'ret's should be shared.  This is PR6032.
    806 
    807 //===---------------------------------------------------------------------===//
    808 
    809 These should compile into the same code (PR6214): perhaps instcombine should
    810 canonicalize the former into the latter?
    811 
    812 define float @foo(float %x) nounwind {
    813   %t = bitcast float %x to i32
    814   %s = and i32 %t, 2147483647
    815   %d = bitcast i32 %s to float
    816   ret float %d
    817 }
    818 
    819 declare float @fabsf(float %n)
    820 define float @bar(float %x) nounwind {
    821   %d = call float @fabsf(float %x)
    822   ret float %d
    823 }
    824 
    825 //===---------------------------------------------------------------------===//
    826 
    827 This IR (from PR6194):
    828 
    829 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
    830 target triple = "x86_64-apple-darwin10.0.0"
    831 
    832 %0 = type { double, double }
    833 %struct.float3 = type { float, float, float }
    834 
    835 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
    836 entry:
    837   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
    838   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
    839   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
    840   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
    841   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
    842   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
    843   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
    844   store float %tmp12, float* %tmp5
    845   ret void
    846 }
    847 
    848 Compiles to:
    849 
    850 _test:                                  ## @test
    851 	movd	%xmm0, %rax
    852 	shrq	$32, %rax
    853 	movl	%eax, 4(%rdi)
    854 	ret
    855 
    856 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
    857 doing a shuffle from v[1] to v[0] then a float store.
    858 
    859 //===---------------------------------------------------------------------===//
    860 
    861 On SSE4 machines, we compile this code:
    862 
    863 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
    864        <2 x float> *%P) nounwind {
    865   %Z = fadd <2 x float> %Q, %R
    866 
    867   store <2 x float> %Z, <2 x float> *%P
    868   ret <2 x float> %Z
    869 }
    870 
    871 into:
    872 
    873 _test2:                                 ## @test2
    874 ## BB#0:
    875 	insertps	$0, %xmm2, %xmm2
    876 	insertps	$16, %xmm3, %xmm2
    877 	insertps	$0, %xmm0, %xmm3
    878 	insertps	$16, %xmm1, %xmm3
    879 	addps	%xmm2, %xmm3
    880 	movq	%xmm3, (%rdi)
    881 	movaps	%xmm3, %xmm0
    882 	pshufd	$1, %xmm3, %xmm1
    883                                         ## kill: XMM1<def> XMM1<kill>
    884 	ret
    885 
    886 The insertps's of $0 are pointless complex copies.
    887 
    888 //===---------------------------------------------------------------------===//
    889 
    890 [UNSAFE FP]
    891 
    892 void foo(double, double, double);
    893 void norm(double x, double y, double z) {
    894   double scale = __builtin_sqrt(x*x + y*y + z*z);
    895   foo(x/scale, y/scale, z/scale);
    896 }
    897 
    898 We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
    899 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
    900 and emit 3 mulsd in place of the divs. This can be done as a target-independent
    901 transform.
    902 
    903 If we're dealing with floats instead of doubles we could even replace the sqrtss
    904 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
    905 cost of reduced accuracy.
    906 
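A source-level sketch of the rewrite being described (assuming -ffast-math
semantics; names hypothetical):

#include <math.h>

void foo(double, double, double);

/* Compute the reciprocal of the magnitude once and replace the three
   divisions with multiplies. */
void norm_fast(double x, double y, double z) {
  double inv_scale = 1.0 / sqrt(x*x + y*y + z*z);
  foo(x * inv_scale, y * inv_scale, z * inv_scale);
}
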
    907 //===---------------------------------------------------------------------===//
    908 
    909 This function should be matched to haddpd when the appropriate CPU is enabled:
    910 
    911 #include <x86intrin.h>
    912 double f (__m128d p) {
    913   return p[0] + p[1];
    914 }
    915 
    916 Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
    917 turn into hsubpd as well.
    918 
    919 //===---------------------------------------------------------------------===//
    920 
    921 define <2 x i32> @foo(<2 x double> %in) {
    922   %x = fptosi <2 x double> %in to <2 x i32>
    923   ret <2 x i32> %x
    924 }
    925 
    926 Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
    927 
    928 //===---------------------------------------------------------------------===//
    929 

README-UNIMPLEMENTED.txt

      1 //===---------------------------------------------------------------------===//
      2 // Testcases that crash the X86 backend because they aren't implemented
      3 //===---------------------------------------------------------------------===//
      4 
      5 These are cases we know the X86 backend doesn't handle.  Patches are welcome
      6 and appreciated, because no one has signed up to implemented these yet.
      7 Implementing these would allow elimination of the corresponding intrinsics,
      8 which would be great.
      9 
     10 1) vector shifts
     11 2) vector comparisons
     12 3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
     13 4) bitcasts from vectors to scalars: PR2804
     14 5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
     15 

README-X86-64.txt

      1 //===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
      2 
      3 AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
      4 multiplication by a constant. How much of it applies to Intel's X86-64
      5 implementation? There are definite trade-offs to consider: latency vs. register
      6 pressure vs. code size.
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 Are we better off using branches instead of cmove to implement FP to
     11 unsigned i64?
     12 
     13 _conv:
     14 	ucomiss	LC0(%rip), %xmm0
     15 	cvttss2siq	%xmm0, %rdx
     16 	jb	L3
     17 	subss	LC0(%rip), %xmm0
     18 	movabsq	$-9223372036854775808, %rax
     19 	cvttss2siq	%xmm0, %rdx
     20 	xorq	%rax, %rdx
     21 L3:
     22 	movq	%rdx, %rax
     23 	ret
     24 
     25 instead of
     26 
     27 _conv:
     28 	movss LCPI1_0(%rip), %xmm1
     29 	cvttss2siq %xmm0, %rcx
     30 	movaps %xmm0, %xmm2
     31 	subss %xmm1, %xmm2
     32 	cvttss2siq %xmm2, %rax
     33 	movabsq $-9223372036854775808, %rdx
     34 	xorq %rdx, %rax
     35 	ucomiss %xmm1, %xmm0
     36 	cmovb %rcx, %rax
     37 	ret
     38 
     39 Seems like the jb branch has a high likelihood of being taken. When it is, the
     40 branch version saves a few instructions.
     41 
     42 //===---------------------------------------------------------------------===//
     43 
     44 It's not possible to reference the AH, BH, CH, and DH registers in an
     45 instruction requiring a REX prefix.  However, divb and mulb both produce
     46 results in AH.  If isel emitted a CopyFromReg from AH, it would be turned into
     47 a movb, and that movb could be allocated one of r8b - r15b, which is invalid.
     48 
     49 To get around this, isel emits a CopyFromReg from AX and then right-shifts it
     50 down by 8 and truncates it. It's not pretty but it works. We need some register
     51 allocation magic to make the hack go away (e.g. putting additional constraints
     52 on the result of the movb).
     53 
     54 //===---------------------------------------------------------------------===//
     55 
     56 The x86-64 ABI for hidden-argument struct returns requires that the
     57 incoming value of %rdi be copied into %rax by the callee upon return.
     58 
     59 The idea is that it saves callers from having to remember this value,
     60 which would often require a callee-saved register. Callees usually
     61 need to keep this value live for most of their body anyway, so it
     62 doesn't add a significant burden on them.
     63 
     64 We currently implement this in codegen, however this is suboptimal
     65 because it means that it would be quite awkward to implement the
     66 optimization for callers.
     67 
     68 A better implementation would be to relax the LLVM IR rules for sret
     69 arguments to allow a function with an sret argument to have a non-void
     70 return type, and to have the front-end to set up the sret argument value
     71 as the return value of the function. The front-end could more easily
     72 emit uses of the returned struct value to be in terms of the function's
     73 lowered return value, and it would free non-C frontends from a
     74 complication only required by a C-based ABI.
     75 
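For reference, a call pattern where the rule matters (hypothetical functions; any
struct larger than 16 bytes is returned via a hidden sret pointer on x86-64):

struct big { long a, b, c; };        /* 24 bytes -> returned via sret */

struct big make_big(long x, long y);

long caller(long x, long y) {
  /* The address of 'tmp' is passed in %rdi; per the ABI the callee must also
     return it in %rax, so the caller could reuse %rax afterwards instead of
     keeping a copy of %rdi in a callee-saved register. */
  struct big tmp = make_big(x, y);
  return tmp.a + tmp.c;
}
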
     76 //===---------------------------------------------------------------------===//
     77 
     78 We get a redundant zero extension for code like this:
     79 
     80 int mask[1000];
     81 int foo(unsigned x) {
     82  if (x < 10)
     83    x = x * 45;
     84  else
     85    x = x * 78;
     86  return mask[x];
     87 }
     88 
     89 _foo:
     90 LBB1_0:	## entry
     91 	cmpl	$9, %edi
     92 	jbe	LBB1_3	## bb
     93 LBB1_1:	## bb1
     94 	imull	$78, %edi, %eax
     95 LBB1_2:	## bb2
     96 	movl	%eax, %eax                    <----
     97 	movq	_mask@GOTPCREL(%rip), %rcx
     98 	movl	(%rcx,%rax,4), %eax
     99 	ret
    100 LBB1_3:	## bb
    101 	imull	$45, %edi, %eax
    102 	jmp	LBB1_2	## bb2
    103   
    104 Before regalloc, we have:
    105 
    106         %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
    107         JMP mbb<bb2,0x203afb0>
    108     Successors according to CFG: 0x203afb0 (#3)
    109 
    110 bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
    111     Predecessors according to CFG: 0x203aec0 (#0)
    112         %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
    113     Successors according to CFG: 0x203afb0 (#3)
    114 
    115 bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
    116     Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
    117         %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
    118                             %reg1026, mbb<bb1,0x203af60>
    119         %reg1029<def> = MOVZX64rr32 %reg1027
    120 
    121 so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
    122 be able to recognize the zero extend.  This could also presumably be implemented
    123 if we have whole-function selectiondags.
    124 
    125 //===---------------------------------------------------------------------===//
    126 
    127 Take the following code
    128 (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
    129 extern unsigned long table[];
    130 unsigned long foo(unsigned char *p) {
    131   unsigned long tag = *p;
    132   return table[tag >> 4] + table[tag & 0xf];
    133 }
    134 
    135 Current code generated:
    136 	movzbl	(%rdi), %eax
    137 	movq	%rax, %rcx
    138 	andq	$240, %rcx
    139 	shrq	%rcx
    140 	andq	$15, %rax
    141 	movq	table(,%rax,8), %rax
    142 	addq	table(%rcx), %rax
    143 	ret
    144 
    145 Issues:
    146 1. First movq should be movl; saves a byte.
    147 2. Both andq's should be andl; saves another two bytes.  I think this was
    148    implemented at one point, but subsequently regressed.
    149 3. shrq should be shrl; saves another byte.
    150 4. The first andq can be completely eliminated by using a slightly more
    151    expensive addressing mode.
    152 
    153 //===---------------------------------------------------------------------===//
    154 
    155 Consider the following (contrived testcase, but contains common factors):
    156 
    157 #include <stdarg.h>
    158 int test(int x, ...) {
    159   int sum, i;
    160   va_list l;
    161   va_start(l, x);
    162   for (i = 0; i < x; i++)
    163     sum += va_arg(l, int);
    164   va_end(l);
    165   return sum;
    166 }
    167 
    168 Testcase given in C because fixing it will likely involve changing the IR
    169 generated for it.  The primary issue with the result is that it doesn't do any
    170 of the optimizations which are possible if we know the address of a va_list
    171 in the current function is never taken:
    172 1. We shouldn't spill the XMM registers because we only call va_arg with "int".
    173 2. It would be nice if we could scalarrepl the va_list.
    174 3. Probably overkill, but it'd be cool if we could peel off the first five
    175 iterations of the loop.
    176 
    177 Other optimizations involving functions which use va_arg on floats which don't
    178 have the address of a va_list taken:
    179 1. Conversely to the above, we shouldn't spill general registers if we only
    180    call va_arg on "double".
    181 2. If we know nothing more than 64 bits wide is read from the XMM registers,
    182    we can change the spilling code to reduce the amount of stack used by half.
    183 
    184 //===---------------------------------------------------------------------===//
    185 

README.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend.
      3 //===---------------------------------------------------------------------===//
      4 
      5 This should be one DIV/IDIV instruction, not a libcall:
      6 
      7 unsigned test(unsigned long long X, unsigned Y) {
      8         return X/Y;
      9 }
     10 
     11 This can be done trivially with a custom legalizer.  What about overflow 
     12 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
     13 
     14 //===---------------------------------------------------------------------===//
     15 
     16 Improvements to the multiply -> shift/add algorithm:
     17 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
     18 
     19 //===---------------------------------------------------------------------===//
     20 
     21 Improve code like this (occurs fairly frequently, e.g. in LLVM):
     22 long long foo(int x) { return 1LL << x; }
     23 
     24 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
     25 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
     26 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
     27 
     28 Another useful one would be  ~0ULL >> X and ~0ULL << X.
     29 
     30 One better solution for 1LL << x is:
     31         xorl    %eax, %eax
     32         xorl    %edx, %edx
     33         testb   $32, %cl
     34         sete    %al
     35         setne   %dl
     36         sall    %cl, %eax
     37         sall    %cl, %edx
     38 
     39 But that requires good 8-bit subreg support.
     40 
     41 Also, this might be better.  It's an extra shift, but it's one instruction
     42 shorter, and doesn't stress 8-bit subreg support.
     43 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
     44 but without the unnecessary and.)
     45         movl %ecx, %eax
     46         shrl $5, %eax
     47         movl %eax, %edx
     48         xorl $1, %edx
     49         sall %cl, %eax
     50         sall %cl, %edx
     51 
     52 64-bit shifts (in general) expand to really bad code.  Instead of using
     53 cmovs, we should expand to a conditional branch like GCC produces.
     54 
     55 //===---------------------------------------------------------------------===//
     56 
     57 Some isel ideas:
     58 
     59 1. Dynamic programming based approach when compile time is not an
     60    issue.
     61 2. Code duplication (addressing mode) during isel.
     62 3. Other ideas from "Register-Sensitive Selection, Duplication, and
     63    Sequencing of Instructions".
     64 4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
     65    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
     66    and other related papers.
     67    http://citeseer.ist.psu.edu/govindarajan01minimum.html
     68 
     69 //===---------------------------------------------------------------------===//
     70 
     71 Should we promote i16 to i32 to avoid partial register update stalls?
     72 
     73 //===---------------------------------------------------------------------===//
     74 
     75 Leave any_extend as a pseudo instruction and hint to the register
     76 allocator. Delay codegen until post register allocation.
     77 Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
     78 the coalescer how to deal with it though.
     79 
     80 //===---------------------------------------------------------------------===//
     81 
     82 It appears icc uses push for parameter passing. Need to investigate.
     83 
     84 //===---------------------------------------------------------------------===//
     85 
     86 This:
     87 
     88 void foo(void);
     89 void bar(int x, int *P) { 
     90   x >>= 2;
     91   if (x) 
     92     foo();
     93   *P = x;
     94 }
     95 
     96 compiles into:
     97 
     98 	movq	%rsi, %rbx
     99 	movl	%edi, %r14d
    100 	sarl	$2, %r14d
    101 	testl	%r14d, %r14d
    102 	je	LBB0_2
    103 
    104 Instead of doing an explicit test, we can use the flags off the sar.  This
    105 occurs in a bigger testcase like this, which is pretty common:
    106 
    107 #include <vector>
    108 int test1(std::vector<int> &X) {
    109   int Sum = 0;
    110   for (long i = 0, e = X.size(); i != e; ++i)
    111     X[i] = 0;
    112   return Sum;
    113 }
    114 
    115 //===---------------------------------------------------------------------===//
    116 
    117 Only use inc/neg/not instructions on processors where they are faster than
    118 add/sub/xor.  They are slower on the P4 due to only updating some processor
    119 flags.
    120 
    121 //===---------------------------------------------------------------------===//
    122 
    123 The instruction selector sometimes misses folding a load into a compare.  The
    124 pattern is written as (cmp reg, (load p)).  Because the compare isn't 
    125 commutative, it is not matched with the load on both sides.  The dag combiner
    126 should be made smart enough to canonicalize the load into the RHS of a compare
    127 when it can invert the result of the compare for free.
    128 
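A tiny example of the pattern (hypothetical): depending on which side of the
comparison the load lands on, it may or may not fold into a single
cmp-with-memory instruction today:

int below(int x, int *p) {
  /* Written this way, the load tends to end up on the LHS of the compare,
     which can only fold if the condition is inverted. */
  return *p > x;
}
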
    129 //===---------------------------------------------------------------------===//
    130 
    131 In many cases, LLVM generates code like this:
    132 
    133 _test:
    134         movl 8(%esp), %eax
    135         cmpl %eax, 4(%esp)
    136         setl %al
    137         movzbl %al, %eax
    138         ret
    139 
    140 On some processors (which ones?), it is more efficient to do this:
    141 
    142 _test:
    143         movl 8(%esp), %ebx
    144         xor  %eax, %eax
    145         cmpl %ebx, 4(%esp)
    146         setl %al
    147         ret
    148 
    149 Doing this correctly is tricky though, as the xor clobbers the flags.
    150 
    151 //===---------------------------------------------------------------------===//
    152 
    153 We should generate bts/btr/etc instructions on targets where they are cheap or
    154 when codesize is important.  e.g., for:
    155 
    156 void setbit(int *target, int bit) {
    157     *target |= (1 << bit);
    158 }
    159 void clearbit(int *target, int bit) {
    160     *target &= ~(1 << bit);
    161 }
    162 
    163 //===---------------------------------------------------------------------===//
    164 
    165 Instead of the following for memset char*, 1, 10:
    166 
    167 	movl $16843009, 4(%edx)
    168 	movl $16843009, (%edx)
    169 	movw $257, 8(%edx)
    170 
    171 It might be better to generate
    172 
    173 	movl $16843009, %eax
    174 	movl %eax, 4(%edx)
    175 	movl %eax, (%edx)
	movw %ax, 8(%edx)
    177 	
    178 when we can spare a register. It reduces code size.
    179 
    180 //===---------------------------------------------------------------------===//
    181 
    182 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
    183 get this:
    184 
    185 define i32 @test1(i32 %X) {
    186     %Y = sdiv i32 %X, 8
    187     ret i32 %Y
    188 }
    189 
    190 _test1:
    191         movl 4(%esp), %eax
    192         movl %eax, %ecx
    193         sarl $31, %ecx
    194         shrl $29, %ecx
    195         addl %ecx, %eax
    196         sarl $3, %eax
    197         ret
    198 
    199 GCC knows several different ways to codegen it, one of which is this:
    200 
    201 _test1:
    202         movl    4(%esp), %eax
    203         cmpl    $-1, %eax
    204         leal    7(%eax), %ecx
    205         cmovle  %ecx, %eax
    206         sarl    $3, %eax
    207         ret
    208 
    209 which is probably slower, but it's interesting at least :)
    210 
    211 //===---------------------------------------------------------------------===//
    212 
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl.  We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).
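
E.g. (an illustrative sketch; the size is just an arbitrary large constant):

#include <string.h>

void big_copy(char *dst, const char *src) {
  memcpy(dst, src, 1024 * 1024);   // currently lowered to rep/movsl; arguably
                                   // better left as a call to libc's memcpy
}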
    217 
    218 //===---------------------------------------------------------------------===//
    219 
    220 Optimize this into something reasonable:
    221  x * copysign(1.0, y) * copysign(1.0, z)
    222 
    223 //===---------------------------------------------------------------------===//
    224 
    225 Optimize copysign(x, *y) to use an integer load from y.
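
That is, for something like (illustrative):

#include <math.h>

double copysign_mem(double x, double *y) {
  return copysign(x, *y);   // the sign of *y could be picked up with an
                            // integer load and mask rather than loading the
                            // whole double into an FP register
}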
    226 
    227 //===---------------------------------------------------------------------===//
    228 
    229 The following tests perform worse with LSR:
    230 
    231 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
    232 
    233 //===---------------------------------------------------------------------===//
    234 
    235 Adding to the list of cmp / test poor codegen issues:
    236 
    237 int test(__m128 *A, __m128 *B) {
    238   if (_mm_comige_ss(*A, *B))
    239     return 3;
    240   else
    241     return 4;
    242 }
    243 
    244 _test:
    245 	movl 8(%esp), %eax
    246 	movaps (%eax), %xmm0
    247 	movl 4(%esp), %eax
    248 	movaps (%eax), %xmm1
    249 	comiss %xmm0, %xmm1
    250 	setae %al
    251 	movzbl %al, %ecx
    252 	movl $3, %eax
    253 	movl $4, %edx
    254 	cmpl $0, %ecx
    255 	cmove %edx, %eax
    256 	ret
    257 
Note that the setae, movzbl, cmpl, and cmove can be replaced with a single
cmovae.  There are a number of issues: 1) We are introducing a setcc between
the result of the intrinsic call and the select. 2) The intrinsic is expected
to produce an i32 value, so an any_extend (which becomes a zero extend) is
added.
    262 
    263 We probably need some kind of target DAG combine hook to fix this.
    264 
    265 //===---------------------------------------------------------------------===//
    266 
    267 We generate significantly worse code for this than GCC:
    268 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
    269 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
    270 
    271 There is also one case we do worse on PPC.
    272 
    273 //===---------------------------------------------------------------------===//
    274 
    275 For this:
    276 
    277 int test(int a)
    278 {
    279   return a * 3;
    280 }
    281 
We currently emit:
    283 	imull $3, 4(%esp), %eax
    284 
Perhaps this is what we really should generate instead? Is imull three or
four cycles? Note: ICC generates this:
    287 	movl	4(%esp), %eax
    288 	leal	(%eax,%eax,2), %eax
    289 
    290 The current instruction priority is based on pattern complexity. The former is
    291 more "complex" because it folds a load so the latter will not be emitted.
    292 
    293 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
    294 should always try to match LEA first since the LEA matching code does some
    295 estimate to determine whether the match is profitable.
    296 
    297 However, if we care more about code size, then imull is better. It's two bytes
    298 shorter than movl + leal.
    299 
    300 On a Pentium M, both variants have the same characteristics with regard
    301 to throughput; however, the multiplication has a latency of four cycles, as
    302 opposed to two cycles for the movl+lea variant.
    303 
    304 //===---------------------------------------------------------------------===//
    305 
    306 __builtin_ffs codegen is messy.
    307 
    308 int ffs_(unsigned X) { return __builtin_ffs(X); }
    309 
    310 llvm produces:
    311 ffs_:
    312         movl    4(%esp), %ecx
    313         bsfl    %ecx, %eax
    314         movl    $32, %edx
    315         cmove   %edx, %eax
    316         incl    %eax
    317         xorl    %edx, %edx
    318         testl   %ecx, %ecx
    319         cmove   %edx, %eax
    320         ret
    321 
    322 vs gcc:
    323 
    324 _ffs_:
    325         movl    $-1, %edx
    326         bsfl    4(%esp), %eax
    327         cmove   %edx, %eax
    328         addl    $1, %eax
    329         ret
    330 
    331 Another example of __builtin_ffs (use predsimplify to eliminate a select):
    332 
    333 int foo (unsigned long j) {
    334   if (j)
    335     return __builtin_ffs (j) - 1;
    336   else
    337     return 0;
    338 }
    339 
    340 //===---------------------------------------------------------------------===//
    341 
It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
    347 
    348 //===---------------------------------------------------------------------===//
    349 
    350 define i32 @foo(i32* %a, i32 %t) {
    351 entry:
    352 	br label %cond_true
    353 
    354 cond_true:		; preds = %cond_true, %entry
    355 	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
    356 	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
    357 	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
    358 	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
    359 	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
    360 	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
    361 	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
    362 	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
    363 	br i1 %tmp, label %bb12, label %cond_true
    364 
    365 bb12:		; preds = %cond_true
    366 	ret i32 %tmp7
    367 }
    368 is pessimized by -loop-reduce and -indvars
    369 
    370 //===---------------------------------------------------------------------===//
    371 
    372 u32 to float conversion improvement:
    373 
    374 float uint32_2_float( unsigned u ) {
    375   float fl = (int) (u & 0xffff);
    376   float fh = (int) (u >> 16);
    377   fh *= 0x1.0p16f;
    378   return fh + fl;
    379 }
    380 
    381 00000000        subl    $0x04,%esp
    382 00000003        movl    0x08(%esp,1),%eax
    383 00000007        movl    %eax,%ecx
    384 00000009        shrl    $0x10,%ecx
    385 0000000c        cvtsi2ss        %ecx,%xmm0
    386 00000010        andl    $0x0000ffff,%eax
    387 00000015        cvtsi2ss        %eax,%xmm1
    388 00000019        mulss   0x00000078,%xmm0
    389 00000021        addss   %xmm1,%xmm0
    390 00000025        movss   %xmm0,(%esp,1)
    391 0000002a        flds    (%esp,1)
    392 0000002d        addl    $0x04,%esp
    393 00000030        ret
    394 
    395 //===---------------------------------------------------------------------===//
    396 
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
    399 
    400 //===---------------------------------------------------------------------===//
    401 
    402 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
    403 simplifications for integer "x cmp y ? a : b".
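
The kind of source patterns involved (illustrative; see i386.c for the actual
transforms):

int sel(int x, int y, int a, int b) {
  return x < y ? a : b;    // general integer select
}
int sign_mask(int x) {
  return x < 0 ? -1 : 0;   // classic special case: just sarl $31
}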
    404 
    405 //===---------------------------------------------------------------------===//
    406 
    407 Consider the expansion of:
    408 
    409 define i32 @test3(i32 %X) {
    410         %tmp1 = urem i32 %X, 255
    411         ret i32 %tmp1
    412 }
    413 
    414 Currently it compiles to:
    415 
    416 ...
    417         movl $2155905153, %ecx
    418         movl 8(%esp), %esi
    419         movl %esi, %eax
    420         mull %ecx
    421 ...
    422 
    423 This could be "reassociated" into:
    424 
    425         movl $2155905153, %eax
    426         movl 8(%esp), %ecx
    427         mull %ecx
    428 
to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses of the mul?
    432 
    433 //===---------------------------------------------------------------------===//
    434 
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example from 256.bzip2:
    438 
    439 In the new trace, the hot loop has an instruction which crosses a cacheline
    440 boundary.  In addition to potential cache misses, this can't help decoding as I
    441 imagine there has to be some kind of complicated decoder reset and realignment
    442 to grab the bytes from the next cacheline.
    443 
532  532 0x3cfc movb     1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, 1809(%esp, %esi)
    446 937  937 0x3d0a incl     %esi
    447 3    3   0x3d0b cmpb     %bl, %dl
    448 27   27  0x3d0d jnz      0x000062db <main+11707>
    449 
    450 //===---------------------------------------------------------------------===//
    451 
    452 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
    453 
    454 //===---------------------------------------------------------------------===//
    455 
    456 This could be a single 16-bit load.
    457 
    458 int f(char *p) {
    459     if ((p[0] == 1) & (p[1] == 2)) return 1;
    460     return 0;
    461 }
    462 
    463 //===---------------------------------------------------------------------===//
    464 
    465 We should inline lrintf and probably other libc functions.
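
For example (illustrative), with SSE available this should become a single
cvtss2si rather than a libcall:

#include <math.h>

long round_to_long(float x) {
  return lrintf(x);   // lrintf rounds using the current rounding mode, which
                      // is exactly what cvtss2si does (via MXCSR)
}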
    466 
    467 //===---------------------------------------------------------------------===//
    468 
    469 Use the FLAGS values from arithmetic instructions more.  For example, compile:
    470 
    471 int add_zf(int *x, int y, int a, int b) {
    472      if ((*x += y) == 0)
    473           return a;
    474      else
    475           return b;
    476 }
    477 
    478 to:
    479        addl    %esi, (%rdi)
    480        movl    %edx, %eax
    481        cmovne  %ecx, %eax
    482        ret
    483 instead of:
    484 
    485 _add_zf:
    486         addl (%rdi), %esi
    487         movl %esi, (%rdi)
    488         testl %esi, %esi
    489         cmove %edx, %ecx
    490         movl %ecx, %eax
    491         ret
    492 
    493 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
    494 without a test instruction.
    495 
    496 //===---------------------------------------------------------------------===//
    497 
    498 These two functions have identical effects:
    499 
    500 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
    501 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
    502 
    503 We currently compile them to:
    504 
    505 _f:
    506         movl 4(%esp), %eax
    507         movl %eax, %ecx
    508         incl %ecx
    509         movl 8(%esp), %edx
    510         cmpl %edx, %ecx
    511         jne LBB1_2      #UnifiedReturnBlock
    512 LBB1_1: #cond_true
    513         addl $2, %eax
    514         ret
    515 LBB1_2: #UnifiedReturnBlock
    516         movl %ecx, %eax
    517         ret
    518 _f2:
    519         movl 4(%esp), %eax
    520         movl %eax, %ecx
    521         incl %ecx
    522         cmpl 8(%esp), %ecx
    523         sete %cl
    524         movzbl %cl, %ecx
    525         leal 1(%ecx,%eax), %eax
    526         ret
    527 
    528 both of which are inferior to GCC's:
    529 
    530 _f:
    531         movl    4(%esp), %edx
    532         leal    1(%edx), %eax
    533         addl    $2, %edx
    534         cmpl    8(%esp), %eax
    535         cmove   %edx, %eax
    536         ret
    537 _f2:
    538         movl    4(%esp), %eax
    539         addl    $1, %eax
    540         xorl    %edx, %edx
    541         cmpl    8(%esp), %eax
    542         sete    %dl
    543         addl    %edx, %eax
    544         ret
    545 
    546 //===---------------------------------------------------------------------===//
    547 
    548 This code:
    549 
    550 void test(int X) {
    551   if (X) abort();
    552 }
    553 
    554 is currently compiled to:
    555 
    556 _test:
    557         subl $12, %esp
    558         cmpl $0, 16(%esp)
    559         jne LBB1_1
    560         addl $12, %esp
    561         ret
    562 LBB1_1:
    563         call L_abort$stub
    564 
    565 It would be better to produce:
    566 
    567 _test:
    568         subl $12, %esp
    569         cmpl $0, 16(%esp)
    570         jne L_abort$stub
    571         addl $12, %esp
    572         ret
    573 
    574 This can be applied to any no-return function call that takes no arguments etc.
    575 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
    576 something like this:
    577 
    578 _test:
    579         cmpl $0, 4(%esp)
    580         jne LBB1_1
    581         ret
    582 LBB1_1:
    583         subl $12, %esp
    584         call L_abort$stub
    585 
    586 Both are useful in different situations.  Finally, it could be shrink-wrapped
    587 and tail called, like this:
    588 
    589 _test:
    590         cmpl $0, 4(%esp)
    591         jne LBB1_1
    592         ret
    593 LBB1_1:
    594         pop %eax   # realign stack.
    595         call L_abort$stub
    596 
    597 Though this probably isn't worth it.
    598 
    599 //===---------------------------------------------------------------------===//
    600 
    601 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
    602 a neg instead of a sub instruction.  Consider:
    603 
    604 int test(char X) { return 7-X; }
    605 
    606 we currently produce:
    607 _test:
    608         movl $7, %eax
    609         movsbl 4(%esp), %ecx
    610         subl %ecx, %eax
    611         ret
    612 
    613 We would use one fewer register if codegen'd as:
    614 
    615         movsbl 4(%esp), %eax
        neg %eax
    617         add $7, %eax
    618         ret
    619 
    620 Note that this isn't beneficial if the load can be folded into the sub.  In
    621 this case, we want a sub:
    622 
    623 int test(int X) { return 7-X; }
    624 _test:
    625         movl $7, %eax
    626         subl 4(%esp), %eax
    627         ret
    628 
    629 //===---------------------------------------------------------------------===//
    630 
    631 Leaf functions that require one 4-byte spill slot have a prolog like this:
    632 
    633 _foo:
    634         pushl   %esi
    635         subl    $4, %esp
    636 ...
    637 and an epilog like this:
    638         addl    $4, %esp
    639         popl    %esi
    640         ret
    641 
    642 It would be smaller, and potentially faster, to push eax on entry and to
    643 pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
    644 into any return registers :)
    645 
    646 //===---------------------------------------------------------------------===//
    647 
    648 The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
    649 branches.  We generate really poor code for:
    650 
    651 double testf(double a) {
    652        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    653 }
    654 
    655 For example, the entry BB is:
    656 
    657 _testf:
    658         subl    $20, %esp
    659         pxor    %xmm0, %xmm0
    660         movsd   24(%esp), %xmm1
    661         ucomisd %xmm0, %xmm1
    662         setnp   %al
    663         sete    %cl
    664         testb   %cl, %al
    665         jne     LBB1_5  # UnifiedReturnBlock
    666 LBB1_1: # cond_true
    667 
    668 
    669 it would be better to replace the last four instructions with:
    670 
    671 	jp LBB1_1
    672 	je LBB1_5
    673 LBB1_1:
    674 
    675 We also codegen the inner ?: into a diamond:
    676 
        cvtss2sd        LCPI1_0(%rip), %xmm2
    678         cvtss2sd        LCPI1_1(%rip), %xmm3
    679         ucomisd %xmm1, %xmm0
    680         ja      LBB1_3  # cond_true
    681 LBB1_2: # cond_true
    682         movapd  %xmm3, %xmm2
    683 LBB1_3: # cond_true
    684         movapd  %xmm2, %xmm0
    685         ret
    686 
    687 We should sink the load into xmm3 into the LBB1_2 block.  This should
    688 be pretty easy, and will nuke all the copies.
    689 
    690 //===---------------------------------------------------------------------===//
    691 
    692 This:
    693         #include <algorithm>
    694         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    695         { return std::make_pair(a + b, a + b < a); }
    696         bool no_overflow(unsigned a, unsigned b)
    697         { return !full_add(a, b).second; }
    698 
    699 Should compile to:
    700 	addl	%esi, %edi
    701 	setae	%al
    702 	movzbl	%al, %eax
    703 	ret
    704 
    705 on x86-64, instead of the rather stupid-looking:
    706 	addl	%esi, %edi
    707 	setb	%al
    708 	xorb	$1, %al
    709 	movzbl	%al, %eax
    710 	ret
    711 
    712 
    713 //===---------------------------------------------------------------------===//
    714 
    715 The following code:
    716 
    717 bb114.preheader:		; preds = %cond_next94
    718 	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
    719 	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
    720 	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
    721 	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
    722 	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
    723 	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
    724 	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
    725 	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
    726 	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
    727 	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
    728 	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
    729 	br label %bb114
    730 
    731 produces:
    732 
    733 LBB3_5:	# bb114.preheader
    734 	movswl	-68(%ebp), %eax
    735 	movl	$32, %ecx
    736 	movl	%ecx, -80(%ebp)
    737 	subl	%eax, -80(%ebp)
    738 	movswl	-52(%ebp), %eax
    739 	movl	%ecx, -84(%ebp)
    740 	subl	%eax, -84(%ebp)
    741 	movswl	-70(%ebp), %eax
    742 	movl	%ecx, -88(%ebp)
    743 	subl	%eax, -88(%ebp)
    744 	movswl	-50(%ebp), %eax
    745 	subl	%eax, %ecx
    746 	movl	%ecx, -76(%ebp)
    747 	movswl	-42(%ebp), %eax
    748 	movl	%eax, -92(%ebp)
    749 	movswl	-66(%ebp), %eax
    750 	movl	%eax, -96(%ebp)
    751 	movw	$0, -98(%ebp)
    752 
    753 This appears to be bad because the RA is not folding the store to the stack 
    754 slot into the movl.  The above instructions could be:
    755 	movl    $32, -80(%ebp)
    756 ...
    757 	movl    $32, -84(%ebp)
    758 ...
    759 This seems like a cross between remat and spill folding.
    760 
    761 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
    762 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
    763 vice-versa).
    764 
    765 //===---------------------------------------------------------------------===//
    766 
    767 This code:
    768 
    769 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
    770 	br i1 %tmp659, label %cond_true662, label %cond_next715
    771 
    772 produces this:
    773 
    774 	testw	%cx, %cx
    775 	movswl	%cx, %esi
    776 	jns	LBB4_109	# cond_next715
    777 
    778 Shark tells us that using %cx in the testw instruction is sub-optimal. It
    779 suggests using the 32-bit register (which is what ICC uses).
    780 
    781 //===---------------------------------------------------------------------===//
    782 
    783 We compile this:
    784 
    785 void compare (long long foo) {
    786   if (foo < 4294967297LL)
    787     abort();
    788 }
    789 
    790 to:
    791 
    792 compare:
    793         subl    $4, %esp
    794         cmpl    $0, 8(%esp)
    795         setne   %al
    796         movzbw  %al, %ax
    797         cmpl    $1, 12(%esp)
    798         setg    %cl
    799         movzbw  %cl, %cx
    800         cmove   %ax, %cx
    801         testb   $1, %cl
    802         jne     .LBB1_2 # UnifiedReturnBlock
    803 .LBB1_1:        # ifthen
    804         call    abort
    805 .LBB1_2:        # UnifiedReturnBlock
    806         addl    $4, %esp
    807         ret
    808 
    809 (also really horrible code on ppc).  This is due to the expand code for 64-bit
    810 compares.  GCC produces multiple branches, which is much nicer:
    811 
    812 compare:
    813         subl    $12, %esp
    814         movl    20(%esp), %edx
    815         movl    16(%esp), %eax
    816         decl    %edx
    817         jle     .L7
    818 .L5:
    819         addl    $12, %esp
    820         ret
    821         .p2align 4,,7
    822 .L7:
    823         jl      .L4
    824         cmpl    $0, %eax
    825         .p2align 4,,8
    826         ja      .L5
    827 .L4:
    828         .p2align 4,,9
    829         call    abort
    830 
    831 //===---------------------------------------------------------------------===//
    832 
Tail call optimization improvements: Tail call optimization currently
pushes all arguments that source from the caller's arguments, or from a
virtual register (possibly itself sourced from the caller's arguments), onto
the top of the stack (their normal place for non-tail-call-optimized calls).
This is done to prevent overwriting parameters (see the example below) that
might be used later.
    840 
    841 example:  
    842 
    843 int callee(int32, int64); 
    844 int caller(int32 arg1, int32 arg2) { 
    845   int64 local = arg2 * 2; 
    846   return callee(arg2, (int64)local); 
    847 }
    848 
    849 [arg1]          [!arg2 no longer valid since we moved local onto it]
    850 [arg2]      ->  [(int64)
    851 [RETADDR]        local  ]
    852 
Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.
    855 
Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter that is used by the callee, and push
   only those onto the top of the stack.
    862 
    863    int callee (int32 arg1, int32 arg2);
    864    int caller (int32 arg1, int32 arg2) {
    865        return callee(arg1,arg2);
    866    }
    867 
    868    Here we don't need to write any variables to the top of the stack
    869    since they don't overwrite each other.
    870 
    871    int callee (int32 arg1, int32 arg2);
    872    int caller (int32 arg1, int32 arg2) {
    873        return callee(arg2,arg1);
    874    }
    875 
    876    Here we need to push the arguments because they overwrite each
    877    other.
    878 
    879 //===---------------------------------------------------------------------===//
    880 
#include <stdlib.h>

int main ()
    882 {
    883   int i = 0;
    884   unsigned long int z = 0;
    885 
    886   do {
    887     z -= 0x00004000;
    888     i++;
    889     if (i > 0x00040000)
    890       abort ();
    891   } while (z > 0);
    892   exit (0);
    893 }
    894 
    895 gcc compiles this to:
    896 
    897 _main:
    898 	subl	$28, %esp
    899 	xorl	%eax, %eax
    900 	jmp	L2
    901 L3:
    902 	cmpl	$262144, %eax
    903 	je	L10
    904 L2:
    905 	addl	$1, %eax
    906 	cmpl	$262145, %eax
    907 	jne	L3
    908 	call	L_abort$stub
    909 L10:
    910 	movl	$0, (%esp)
    911 	call	L_exit$stub
    912 
    913 llvm:
    914 
    915 _main:
    916 	subl	$12, %esp
    917 	movl	$1, %eax
    918 	movl	$16384, %ecx
    919 LBB1_1:	# bb
    920 	cmpl	$262145, %eax
    921 	jge	LBB1_4	# cond_true
    922 LBB1_2:	# cond_next
    923 	incl	%eax
    924 	addl	$4294950912, %ecx
    925 	cmpl	$16384, %ecx
    926 	jne	LBB1_1	# bb
    927 LBB1_3:	# bb11
    928 	xorl	%eax, %eax
    929 	addl	$12, %esp
    930 	ret
    931 LBB1_4:	# cond_true
    932 	call	L_abort$stub
    933 
    934 1. LSR should rewrite the first cmp with induction variable %ecx.
    935 2. DAG combiner should fold
    936         leal    1(%eax), %edx
    937         cmpl    $262145, %edx
    938    =>
    939         cmpl    $262144, %eax
    940 
    941 //===---------------------------------------------------------------------===//
    942 
    943 define i64 @test(double %X) {
    944 	%Y = fptosi double %X to i64
    945 	ret i64 %Y
    946 }
    947 
    948 compiles to:
    949 
    950 _test:
    951 	subl	$20, %esp
    952 	movsd	24(%esp), %xmm0
    953 	movsd	%xmm0, 8(%esp)
    954 	fldl	8(%esp)
    955 	fisttpll	(%esp)
    956 	movl	4(%esp), %edx
    957 	movl	(%esp), %eax
    958 	addl	$20, %esp
    959 	#FP_REG_KILL
    960 	ret
    961 
    962 This should just fldl directly from the input stack slot.
    963 
    964 //===---------------------------------------------------------------------===//
    965 
    966 This code:
    967 int foo (int x) { return (x & 65535) | 255; }
    968 
    969 Should compile into:
    970 
    971 _foo:
    972         movzwl  4(%esp), %eax
    973         orl     $255, %eax
    974         ret
    975 
    976 instead of:
    977 _foo:
    978 	movl	$65280, %eax
    979 	andl	4(%esp), %eax
    980 	orl	$255, %eax
    981 	ret
    982 
    983 //===---------------------------------------------------------------------===//
    984 
    985 We're codegen'ing multiply of long longs inefficiently:
    986 
    987 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    988   return arg1 *  arg2;
    989 }
    990 
    991 We compile to (fomit-frame-pointer):
    992 
    993 _LLM:
    994 	pushl	%esi
    995 	movl	8(%esp), %ecx
    996 	movl	16(%esp), %esi
    997 	movl	%esi, %eax
    998 	mull	%ecx
    999 	imull	12(%esp), %esi
   1000 	addl	%edx, %esi
   1001 	imull	20(%esp), %ecx
   1002 	movl	%esi, %edx
   1003 	addl	%ecx, %edx
   1004 	popl	%esi
   1005 	ret
   1006 
   1007 This looks like a scheduling deficiency and lack of remat of the load from
   1008 the argument area.  ICC apparently produces:
   1009 
   1010         movl      8(%esp), %ecx
   1011         imull     12(%esp), %ecx
   1012         movl      16(%esp), %eax
   1013         imull     4(%esp), %eax 
   1014         addl      %eax, %ecx  
   1015         movl      4(%esp), %eax
   1016         mull      12(%esp) 
   1017         addl      %ecx, %edx
   1018         ret
   1019 
   1020 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
   1021 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
   1022 
   1023 //===---------------------------------------------------------------------===//
   1024 
   1025 We can fold a store into "zeroing a reg".  Instead of:
   1026 
   1027 xorl    %eax, %eax
   1028 movl    %eax, 124(%esp)
   1029 
   1030 we should get:
   1031 
   1032 movl    $0, 124(%esp)
   1033 
   1034 if the flags of the xor are dead.
   1035 
   1036 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
   1037 be folded into: shl [mem], 1
   1038 
   1039 //===---------------------------------------------------------------------===//
   1040 
In SSE mode, we turn abs and neg into a load from the constant pool plus an
xor or an and instruction, for example:
   1043 
   1044 	xorpd	LCPI1_0, %xmm2
   1045 
   1046 However, if xmm2 gets spilled, we end up with really ugly code like this:
   1047 
   1048 	movsd	(%esp), %xmm0
   1049 	xorpd	LCPI1_0, %xmm0
   1050 	movsd	%xmm0, (%esp)
   1051 
   1052 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
   1053 the neg/abs instruction, turning it into an *integer* operation, like this:
   1054 
   1055 	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
   1056 
   1057 you could also use xorb, but xorl is less likely to lead to a partial register
   1058 stall.  Here is a contrived testcase:
   1059 
   1060 double a, b, c;
   1061 void test(double *P) {
   1062   double X = *P;
   1063   a = X;
   1064   bar();
   1065   X = -X;
   1066   b = X;
   1067   bar();
   1068   c = X;
   1069 }
   1070 
   1071 //===---------------------------------------------------------------------===//
   1072 
The code generated on x86 for checking for signed overflow of a multiply in
the obvious way is much longer than it needs to be.
   1075 
   1076 int x(int a, int b) {
   1077   long long prod = (long long)a*b;
   1078   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
   1079 }
   1080 
   1081 See PR2053 for more details.
   1082 
   1083 //===---------------------------------------------------------------------===//
   1084 
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
   1086 more aggressively; it should cost the same as a move+shift on any modern
   1087 processor, but it's a lot shorter. Downside is that it puts more
   1088 pressure on register allocation because it has fixed operands.
   1089 
   1090 Example:
   1091 int abs(int x) {return x < 0 ? -x : x;}
   1092 
   1093 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
   1094 abs:
   1095         movl    4(%esp), %eax
   1096         cltd
   1097         xorl    %edx, %eax
   1098         subl    %edx, %eax
   1099         ret
   1100 
   1101 //===---------------------------------------------------------------------===//
   1102 
   1103 Take the following code (from 
   1104 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
   1105 
   1106 extern unsigned char first_one[65536];
   1107 int FirstOnet(unsigned long long arg1)
   1108 {
   1109   if (arg1 >> 48)
   1110     return (first_one[arg1 >> 48]);
   1111   return 0;
   1112 }
   1113 
   1114 
   1115 The following code is currently generated:
   1116 FirstOnet:
   1117         movl    8(%esp), %eax
   1118         cmpl    $65536, %eax
   1119         movl    4(%esp), %ecx
   1120         jb      .LBB1_2 # UnifiedReturnBlock
   1121 .LBB1_1:        # ifthen
   1122         shrl    $16, %eax
   1123         movzbl  first_one(%eax), %eax
   1124         ret
   1125 .LBB1_2:        # UnifiedReturnBlock
   1126         xorl    %eax, %eax
   1127         ret
   1128 
   1129 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
   1130 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
   1131 
   1132 //===---------------------------------------------------------------------===//
   1133 
   1134 We compile this function:
   1135 
   1136 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
   1137 entry:
   1138 	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
   1139 	br i1 %tmp2, label %bb7, label %bb
   1140 
   1141 bb:		; preds = %entry
   1142 	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
   1143 	ret i32 %tmp6
   1144 
   1145 bb7:		; preds = %entry
   1146 	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
   1147 	ret i32 %tmp10
   1148 }
   1149 
   1150 to:
   1151 
   1152 foo:                                    # @foo
   1153 # BB#0:                                 # %entry
   1154 	movl	4(%esp), %ecx
   1155 	cmpb	$0, 16(%esp)
   1156 	je	.LBB0_2
   1157 # BB#1:                                 # %bb
   1158 	movl	8(%esp), %eax
   1159 	addl	%ecx, %eax
   1160 	ret
   1161 .LBB0_2:                                # %bb7
   1162 	movl	12(%esp), %edx
   1163 	movl	%ecx, %eax
   1164 	subl	%edx, %eax
   1165 	ret
   1166 
   1167 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
   1168 couple more movls by putting 4(%esp) into %eax instead of %ecx.
   1169 
   1170 //===---------------------------------------------------------------------===//
   1171 
   1172 See rdar://4653682.
   1173 
   1174 From flops:
   1175 
   1176 LBB1_15:        # bb310
   1177         cvtss2sd        LCPI1_0, %xmm1
   1178         addsd   %xmm1, %xmm0
   1179         movsd   176(%esp), %xmm2
   1180         mulsd   %xmm0, %xmm2
   1181         movapd  %xmm2, %xmm3
   1182         mulsd   %xmm3, %xmm3
   1183         movapd  %xmm3, %xmm4
   1184         mulsd   LCPI1_23, %xmm4
   1185         addsd   LCPI1_24, %xmm4
   1186         mulsd   %xmm3, %xmm4
   1187         addsd   LCPI1_25, %xmm4
   1188         mulsd   %xmm3, %xmm4
   1189         addsd   LCPI1_26, %xmm4
   1190         mulsd   %xmm3, %xmm4
   1191         addsd   LCPI1_27, %xmm4
   1192         mulsd   %xmm3, %xmm4
   1193         addsd   LCPI1_28, %xmm4
   1194         mulsd   %xmm3, %xmm4
   1195         addsd   %xmm1, %xmm4
   1196         mulsd   %xmm2, %xmm4
   1197         movsd   152(%esp), %xmm1
   1198         addsd   %xmm4, %xmm1
   1199         movsd   %xmm1, 152(%esp)
   1200         incl    %eax
   1201         cmpl    %eax, %esi
   1202         jge     LBB1_15 # bb310
   1203 LBB1_16:        # bb358.loopexit
   1204         movsd   152(%esp), %xmm0
   1205         addsd   %xmm0, %xmm0
   1206         addsd   LCPI1_22, %xmm0
   1207         movsd   %xmm0, 152(%esp)
   1208 
Rather than spilling the result of the last addsd in the loop, we should
insert a copy to split the interval (one for the duration of the loop, one
extending to the fall-through). The register pressure in the loop isn't high
   1212 enough to warrant the spill.
   1213 
   1214 Also check why xmm7 is not used at all in the function.
   1215 
   1216 //===---------------------------------------------------------------------===//
   1217 
   1218 Take the following:
   1219 
   1220 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
   1221 target triple = "i386-apple-darwin8"
   1222 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
   1223 define fastcc void @abort_gzip() noreturn nounwind  {
   1224 entry:
   1225 	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
   1226 	br i1 %tmp.b.i, label %bb.i, label %bb4.i
   1227 bb.i:		; preds = %entry
   1228 	tail call void @exit( i32 1 ) noreturn nounwind 
   1229 	unreachable
   1230 bb4.i:		; preds = %entry
   1231 	store i1 true, i1* @in_exit.4870.b
   1232 	tail call void @exit( i32 1 ) noreturn nounwind 
   1233 	unreachable
   1234 }
   1235 declare void @exit(i32) noreturn nounwind 
   1236 
   1237 This compiles into:
   1238 _abort_gzip:                            ## @abort_gzip
   1239 ## BB#0:                                ## %entry
   1240 	subl	$12, %esp
   1241 	movb	_in_exit.4870.b, %al
   1242 	cmpb	$1, %al
   1243 	jne	LBB0_2
   1244 
   1245 We somehow miss folding the movb into the cmpb.
   1246 
   1247 //===---------------------------------------------------------------------===//
   1248 
   1249 We compile:
   1250 
   1251 int test(int x, int y) {
   1252   return x-y-1;
   1253 }
   1254 
   1255 into (-m64):
   1256 
   1257 _test:
   1258 	decl	%edi
   1259 	movl	%edi, %eax
   1260 	subl	%esi, %eax
   1261 	ret
   1262 
   1263 it would be better to codegen as: x+~y  (notl+addl)
   1264 
   1265 //===---------------------------------------------------------------------===//
   1266 
   1267 This code:
   1268 
   1269 int foo(const char *str,...)
   1270 {
   1271  __builtin_va_list a; int x;
   1272  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
   1273  return x;
   1274 }
   1275 
   1276 gets compiled into this on x86-64:
   1277 	subq    $200, %rsp
   1278         movaps  %xmm7, 160(%rsp)
   1279         movaps  %xmm6, 144(%rsp)
   1280         movaps  %xmm5, 128(%rsp)
   1281         movaps  %xmm4, 112(%rsp)
   1282         movaps  %xmm3, 96(%rsp)
   1283         movaps  %xmm2, 80(%rsp)
   1284         movaps  %xmm1, 64(%rsp)
   1285         movaps  %xmm0, 48(%rsp)
   1286         movq    %r9, 40(%rsp)
   1287         movq    %r8, 32(%rsp)
   1288         movq    %rcx, 24(%rsp)
   1289         movq    %rdx, 16(%rsp)
   1290         movq    %rsi, 8(%rsp)
   1291         leaq    (%rsp), %rax
   1292         movq    %rax, 192(%rsp)
   1293         leaq    208(%rsp), %rax
   1294         movq    %rax, 184(%rsp)
   1295         movl    $48, 180(%rsp)
   1296         movl    $8, 176(%rsp)
   1297         movl    176(%rsp), %eax
   1298         cmpl    $47, %eax
   1299         jbe     .LBB1_3 # bb
   1300 .LBB1_1:        # bb3
   1301         movq    184(%rsp), %rcx
   1302         leaq    8(%rcx), %rax
   1303         movq    %rax, 184(%rsp)
   1304 .LBB1_2:        # bb4
   1305         movl    (%rcx), %eax
   1306         addq    $200, %rsp
   1307         ret
   1308 .LBB1_3:        # bb
   1309         movl    %eax, %ecx
   1310         addl    $8, %eax
   1311         addq    192(%rsp), %rcx
   1312         movl    %eax, 176(%rsp)
   1313         jmp     .LBB1_2 # bb4
   1314 
   1315 gcc 4.3 generates:
   1316 	subq    $96, %rsp
   1317 .LCFI0:
   1318         leaq    104(%rsp), %rax
   1319         movq    %rsi, -80(%rsp)
   1320         movl    $8, -120(%rsp)
   1321         movq    %rax, -112(%rsp)
   1322         leaq    -88(%rsp), %rax
   1323         movq    %rax, -104(%rsp)
   1324         movl    $8, %eax
   1325         cmpl    $48, %eax
   1326         jb      .L6
   1327         movq    -112(%rsp), %rdx
   1328         movl    (%rdx), %eax
   1329         addq    $96, %rsp
   1330         ret
   1331         .p2align 4,,10
   1332         .p2align 3
   1333 .L6:
   1334         mov     %eax, %edx
   1335         addq    -104(%rsp), %rdx
   1336         addl    $8, %eax
   1337         movl    %eax, -120(%rsp)
   1338         movl    (%rdx), %eax
   1339         addq    $96, %rsp
   1340         ret
   1341 
   1342 and it gets compiled into this on x86:
   1343 	pushl   %ebp
   1344         movl    %esp, %ebp
   1345         subl    $4, %esp
   1346         leal    12(%ebp), %eax
   1347         movl    %eax, -4(%ebp)
   1348         leal    16(%ebp), %eax
   1349         movl    %eax, -4(%ebp)
   1350         movl    12(%ebp), %eax
   1351         addl    $4, %esp
   1352         popl    %ebp
   1353         ret
   1354 
   1355 gcc 4.3 generates:
   1356 	pushl   %ebp
   1357         movl    %esp, %ebp
   1358         movl    12(%ebp), %eax
   1359         popl    %ebp
   1360         ret
   1361 
   1362 //===---------------------------------------------------------------------===//
   1363 
   1364 Teach tblgen not to check bitconvert source type in some cases. This allows us
   1365 to consolidate the following patterns in X86InstrMMX.td:
   1366 
   1367 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1368                                                   (iPTR 0))))),
   1369           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
   1370 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1371                                                   (iPTR 0))))),
   1372           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
   1373 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1374                                                   (iPTR 0))))),
   1375           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
   1376 
   1377 There are other cases in various td files.
   1378 
   1379 //===---------------------------------------------------------------------===//
   1380 
   1381 Take something like the following on x86-32:
   1382 unsigned a(unsigned long long x, unsigned y) {return x % y;}
   1383 
   1384 We currently generate a libcall, but we really shouldn't: the expansion is
   1385 shorter and likely faster than the libcall.  The expected code is something
   1386 like the following:
   1387 
   1388 	movl	12(%ebp), %eax
   1389 	movl	16(%ebp), %ecx
   1390 	xorl	%edx, %edx
   1391 	divl	%ecx
   1392 	movl	8(%ebp), %eax
   1393 	divl	%ecx
   1394 	movl	%edx, %eax
   1395 	ret
   1396 
   1397 A similar code sequence works for division.
   1398 
   1399 //===---------------------------------------------------------------------===//
   1400 
These should compile to the same code, but the latter codegens to useless
   1402 instructions on X86. This may be a trivial dag combine (GCC PR7061):
   1403 
   1404 struct s1 { unsigned char a, b; };
   1405 unsigned long f1(struct s1 x) {
   1406     return x.a + x.b;
   1407 }
   1408 struct s2 { unsigned a: 8, b: 8; };
   1409 unsigned long f2(struct s2 x) {
   1410     return x.a + x.b;
   1411 }
   1412 
   1413 //===---------------------------------------------------------------------===//
   1414 
   1415 We currently compile this:
   1416 
   1417 define i32 @func1(i32 %v1, i32 %v2) nounwind {
   1418 entry:
   1419   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   1420   %sum = extractvalue {i32, i1} %t, 0
   1421   %obit = extractvalue {i32, i1} %t, 1
   1422   br i1 %obit, label %overflow, label %normal
   1423 normal:
   1424   ret i32 %sum
   1425 overflow:
   1426   call void @llvm.trap()
   1427   unreachable
   1428 }
   1429 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
   1430 declare void @llvm.trap()
   1431 
   1432 to:
   1433 
   1434 _func1:
   1435 	movl	4(%esp), %eax
   1436 	addl	8(%esp), %eax
   1437 	jo	LBB1_2	## overflow
   1438 LBB1_1:	## normal
   1439 	ret
   1440 LBB1_2:	## overflow
   1441 	ud2
   1442 
   1443 it would be nice to produce "into" someday.
   1444 
   1445 //===---------------------------------------------------------------------===//
   1446 
   1447 Test instructions can be eliminated by using EFLAGS values from arithmetic
   1448 instructions. This is currently not done for mul, and, or, xor, neg, shl,
   1449 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
   1451 OF or CF flags are needed.
   1452 
   1453 The shift operators have the complication that when the shift count is
   1454 zero, EFLAGS is not set, so they can only subsume a test instruction if
   1455 the shift count is known to be non-zero. Also, using the EFLAGS value
   1456 from a shift is apparently very slow on some x86 implementations.
   1457 
   1458 In read-modify-write instructions, the root node in the isel match is
   1459 the store, and isel has no way for the use of the EFLAGS result of the
   1460 arithmetic to be remapped to the new node.
   1461 
Add and subtract instructions set OF on signed overflow and CF on unsigned
   1463 overflow, while test instructions always clear OF and CF. In order to
   1464 replace a test with an add or subtract in a situation where OF or CF is
   1465 needed, codegen must be able to prove that the operation cannot see
   1466 signed or unsigned overflow, respectively.
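
A small example of a test that could be removed (illustrative; the and sets
ZF itself):

int and_is_zero(int x, int y) {
  x &= y;
  return x == 0;   // the testl emitted for this compare is redundant: the
                   // andl above already set ZF
}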
   1467 
   1468 //===---------------------------------------------------------------------===//
   1469 
   1470 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
   1471 define <16 x float> @foo(<16 x float> %A) nounwind {
   1472 	%tmp = alloca <16 x float>, align 16
   1473 	%tmp2 = alloca <16 x float>, align 16
   1474 	store <16 x float> %A, <16 x float>* %tmp
   1475 	%s = bitcast <16 x float>* %tmp to i8*
   1476 	%s2 = bitcast <16 x float>* %tmp2 to i8*
   1477 	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
   1478 	%R = load <16 x float>* %tmp2
   1479 	ret <16 x float> %R
   1480 }
   1481 
   1482 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
   1483 
   1484 which compiles to:
   1485 
   1486 _foo:
   1487 	subl	$140, %esp
   1488 	movaps	%xmm3, 112(%esp)
   1489 	movaps	%xmm2, 96(%esp)
   1490 	movaps	%xmm1, 80(%esp)
   1491 	movaps	%xmm0, 64(%esp)
   1492 	movl	60(%esp), %eax
   1493 	movl	%eax, 124(%esp)
   1494 	movl	56(%esp), %eax
   1495 	movl	%eax, 120(%esp)
   1496 	movl	52(%esp), %eax
   1497         <many many more 32-bit copies>
   1498       	movaps	(%esp), %xmm0
   1499 	movaps	16(%esp), %xmm1
   1500 	movaps	32(%esp), %xmm2
   1501 	movaps	48(%esp), %xmm3
   1502 	addl	$140, %esp
   1503 	ret
   1504 
   1505 On Nehalem, it may even be cheaper to just use movups when unaligned than to
   1506 fall back to lower-granularity chunks.
   1507 
   1508 //===---------------------------------------------------------------------===//
   1509 
   1510 Implement processor-specific optimizations for parity with GCC on these
   1511 processors.  GCC does two optimizations:
   1512 
1. ix86_pad_returns inserts a noop before ret instructions that are immediately
   preceded by a conditional branch or are the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
  Core 2, and "Generic".
   1521 
   1522 //===---------------------------------------------------------------------===//

Testcase:
   1524 int x(int a) { return (a&0xf0)>>4; }
   1525 
   1526 Current output:
   1527 	movl	4(%esp), %eax
   1528 	shrl	$4, %eax
   1529 	andl	$15, %eax
   1530 	ret
   1531 
   1532 Ideal output:
   1533 	movzbl	4(%esp), %eax
   1534 	shrl	$4, %eax
   1535 	ret
   1536 
   1537 //===---------------------------------------------------------------------===//
   1538 
   1539 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
   1540 properly.
   1541 
When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to produce the result in a register. Instead, it
can use the add, sub, inc, and dec instructions with the "lock" prefix.

This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain. The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def. The proper solution is to add new ISD opcodes for the no-output
variant. The DAG combiner can then transform the node before it gets to target
node selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions. The asm printer (or JIT) can
then use this information to add the "lock" prefix.
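
For reference (an illustrative case), the situation in question is:

void bump(volatile int *counter) {
  __sync_add_and_fetch(counter, 1);   // result unused, so a single
                                      // "lock addl $1, (mem)" or "lock incl"
                                      // would be sufficient
}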
   1558 
   1559 //===---------------------------------------------------------------------===//
   1560 
   1561 struct B {
   1562   unsigned char y0 : 1;
   1563 };
   1564 
   1565 int bar(struct B* a) { return a->y0; }
   1566 
   1567 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
   1568   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
   1569   %2 = load i8* %1, align 1
   1570   %3 = and i8 %2, 1
   1571   %4 = zext i8 %3 to i32
   1572   ret i32 %4
   1573 }
   1574 
   1575 bar:                                    # @bar
   1576 # BB#0:
   1577         movb    (%rdi), %al
   1578         andb    $1, %al
   1579         movzbl  %al, %eax
   1580         ret
   1581 
   1582 Missed optimization: should be movl+andl.
   1583 
   1584 //===---------------------------------------------------------------------===//
   1585 
   1586 The x86_64 abi says:
   1587 
   1588 Booleans, when stored in a memory object, are stored as single byte objects the
   1589 value of which is always 0 (false) or 1 (true).
   1590 
   1591 We are not using this fact:
   1592 
   1593 int bar(_Bool *a) { return *a; }
   1594 
   1595 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
   1596   %1 = load i8* %a, align 1, !tbaa !0
   1597   %tmp = and i8 %1, 1
   1598   %2 = zext i8 %tmp to i32
   1599   ret i32 %2
   1600 }
   1601 
   1602 bar:
   1603         movb    (%rdi), %al
   1604         andb    $1, %al
   1605         movzbl  %al, %eax
   1606         ret
   1607 
   1608 GCC produces
   1609 
   1610 bar:
   1611         movzbl  (%rdi), %eax
   1612         ret
   1613 
   1614 //===---------------------------------------------------------------------===//
   1615 
   1616 Consider the following two functions compiled with clang:
   1617 _Bool foo(int *x) { return !(*x & 4); }
   1618 unsigned bar(int *x) { return !(*x & 4); }
   1619 
   1620 foo:
   1621 	movl	4(%esp), %eax
   1622 	testb	$4, (%eax)
   1623 	sete	%al
   1624 	movzbl	%al, %eax
   1625 	ret
   1626 
   1627 bar:
   1628 	movl	4(%esp), %eax
   1629 	movl	(%eax), %eax
   1630 	shrl	$2, %eax
   1631 	andl	$1, %eax
   1632 	xorl	$1, %eax
   1633 	ret
   1634 
The second function generates more code even though the two functions are
functionally identical.
   1637 
   1638 //===---------------------------------------------------------------------===//
   1639 
   1640 Take the following C code:
   1641 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
   1642 
   1643 We generate the following IR with clang:
   1644 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1645 entry:
   1646   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
   1647   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
   1648   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
   1649   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1650   ret i32 %conv5
   1651 }
   1652 
   1653 And the following x86 code:
   1654 	xorl	%esi, %edi
   1655 	testb	$-1, %dil
   1656 	sete	%al
   1657 	movzbl	%al, %eax
   1658 	ret
   1659 
   1660 A cmpb instead of the xorl+testb would be one instruction shorter.
   1661 
   1662 //===---------------------------------------------------------------------===//
   1663 
   1664 Given the following C code:
   1665 int f(int a, int b) { return (signed char)a == (signed char)b; }
   1666 
   1667 We generate the following IR with clang:
   1668 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1669 entry:
   1670   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
   1671   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
   1672   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
   1673   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
   1674   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
   1675   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1676   ret i32 %conv5
   1677 }
   1678 
   1679 And the following x86 code:
   1680 	movsbl	%sil, %eax
   1681 	movsbl	%dil, %ecx
   1682 	cmpl	%eax, %ecx
   1683 	sete	%al
   1684 	movzbl	%al, %eax
   1685 	ret
   1686 
   1687 
   1688 It should be possible to eliminate the sign extensions.
   1689 
   1690 //===---------------------------------------------------------------------===//
   1691 
   1692 LLVM misses a load+store narrowing opportunity in this code:
   1693 
   1694 %struct.bf = type { i64, i16, i16, i32 }
   1695 
   1696 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
   1697 
   1698 define void @t1() nounwind ssp {
   1699 entry:
   1700   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1701   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
   1702   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
   1703   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
   1704   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
   1705   store i32 %4, i32* %2, align 1
   1706   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1707   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
   1708   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
   1709   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
   1710   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
   1711   store i32 %9, i32* %7, align 1
   1712   ret void
   1713 }
   1714 
   1715 LLVM currently emits this:
   1716 
   1717   movq  bfi(%rip), %rax
   1718   andl  $-65537, 8(%rax)
   1719   movq  bfi(%rip), %rax
   1720   andl  $-131073, 8(%rax)
   1721   ret
   1722 
   1723 It could narrow the loads and stores to emit this:
   1724 
   1725   movq  bfi(%rip), %rax
   1726   andb  $-2, 10(%rax)
   1727   movq  bfi(%rip), %rax
   1728   andb  $-3, 10(%rax)
   1729   ret
   1730 
   1731 The trouble is that there is a TokenFactor between the store and the
   1732 load, making it non-trivial to determine if there's anything between
   1733 the load and the store which would prohibit narrowing.
   1734 
   1735 //===---------------------------------------------------------------------===//
   1736 
   1737 This code:
   1738 void foo(unsigned x) {
   1739   if (x == 0) bar();
   1740   else if (x == 1) qux();
   1741 }
   1742 
   1743 currently compiles into:
   1744 _foo:
   1745 	movl	4(%esp), %eax
   1746 	cmpl	$1, %eax
   1747 	je	LBB0_3
   1748 	testl	%eax, %eax
   1749 	jne	LBB0_4
   1750 
   1751 the testl could be removed:
   1752 _foo:
   1753 	movl	4(%esp), %eax
   1754 	cmpl	$1, %eax
   1755 	je	LBB0_3
   1756 	jb	LBB0_4
   1757 
   1758 0 is the only unsigned number < 1.
   1759 
//===---------------------------------------------------------------------===//

This code:

%0 = type { i32, i1 }

define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
entry:
  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
  %cmp = extractvalue %0 %uadd, 1
  %inc = zext i1 %cmp to i32
  %add = add i32 %x, %sum
  %z.0 = add i32 %add, %inc
  ret i32 %z.0
}

declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

compiles to:

_add32carry:                            ## @add32carry
	addl	%esi, %edi
	sbbl	%ecx, %ecx
	movl	%edi, %eax
	subl	%ecx, %eax
	ret

But it could be:

_add32carry:
	leal	(%rsi,%rdi), %eax
	cmpl	%esi, %eax
	adcl	$0, %eax
	ret
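
For reference, a C form of the same computation that exposes the carry
directly; this is exactly what the suggested leal/cmpl/adcl sequence computes
(add32carry_c is a made-up name):

unsigned add32carry_c(unsigned sum, unsigned x) {
  unsigned add = sum + x;
  return add + (add < x);   /* (add < x) is the carry out of sum + x */
}
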
//===---------------------------------------------------------------------===//

The hot loop of 256.bzip2 contains code that looks a bit like this:

int foo(char *P, char *Q, int x, int y) {
  if (P[0] != Q[0])
     return P[0] < Q[0];
  if (P[1] != Q[1])
     return P[1] < Q[1];
  if (P[2] != Q[2])
     return P[2] < Q[2];
   return P[3] < Q[3];
}

In the real code, we get a lot more wrong than this.  However, even in this
code we generate:

_foo:                                   ## @foo
## BB#0:                                ## %entry
	movb	(%rsi), %al
	movb	(%rdi), %cl
	cmpb	%al, %cl
	je	LBB0_2
LBB0_1:                                 ## %if.then
	cmpb	%al, %cl
	jmp	LBB0_5
LBB0_2:                                 ## %if.end
	movb	1(%rsi), %al
	movb	1(%rdi), %cl
	cmpb	%al, %cl
	jne	LBB0_1
## BB#3:                                ## %if.end38
	movb	2(%rsi), %al
	movb	2(%rdi), %cl
	cmpb	%al, %cl
	jne	LBB0_1
## BB#4:                                ## %if.end60
	movb	3(%rdi), %al
	cmpb	3(%rsi), %al
LBB0_5:                                 ## %if.end60
	setl	%al
	movzbl	%al, %eax
	ret

Note that we generate jumps to LBB0_1 which does a redundant compare.  The
redundant compare also forces the register values to be live, which prevents
folding one of the loads into the compare.  In contrast, GCC 4.2 produces:

_foo:
	movzbl	(%rsi), %eax
	cmpb	%al, (%rdi)
	jne	L10
L12:
	movzbl	1(%rsi), %eax
	cmpb	%al, 1(%rdi)
	jne	L10
	movzbl	2(%rsi), %eax
	cmpb	%al, 2(%rdi)
	jne	L10
	movzbl	3(%rdi), %eax
	cmpb	3(%rsi), %al
L10:
	setl	%al
	movzbl	%al, %eax
	ret

which is "perfect".

//===---------------------------------------------------------------------===//

For the branch in the following code:
int a();
int b(int x, int y) {
  if (x & (1<<(y&7)))
    return a();
  return y;
}

We currently generate:
	movb	%sil, %al
	andb	$7, %al
	movzbl	%al, %eax
	btl	%eax, %edi
	jae	.LBB0_2

movl+andl would be shorter than the movb+andb+movzbl sequence.

//===---------------------------------------------------------------------===//

For the following:
struct u1 {
    float x, y;
};
float foo(struct u1 u) {
    return u.x + u.y;
}

We currently generate:
	movdqa	%xmm0, %xmm1
	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
	addss	%xmm1, %xmm0
	ret

We could save an instruction here by commuting the addss: if the sum is
accumulated in the register that already holds u.x, the pshufd can write to a
scratch register and the movdqa copy is no longer needed.

//===---------------------------------------------------------------------===//

This (from PR9661):

float clamp_float(float a) {
        if (a > 1.0f)
                return 1.0f;
        else if (a < 0.0f)
                return 0.0f;
        else
                return a;
}

Could compile to:

clamp_float:                            # @clamp_float
        movss   .LCPI0_0(%rip), %xmm1
        minss   %xmm1, %xmm0
        pxor    %xmm1, %xmm1
        maxss   %xmm1, %xmm0
        ret

with -ffast-math.
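
Under -ffast-math the NaN-ordering subtleties of the branches can be ignored,
so the function is equivalent to a min/max pair, which maps directly onto
minss/maxss.  A C sketch of that equivalent (clamp_float_fast is made up):

#include <math.h>

float clamp_float_fast(float a) {
  return fmaxf(0.0f, fminf(a, 1.0f));   /* maxss(0, minss(a, 1)) */
}
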
//===---------------------------------------------------------------------===//

This function (from PR9803):

int clamp2(int a) {
        if (a > 5)
                a = 5;
        if (a < 0)
                return 0;
        return a;
}

Compiles to:

_clamp2:                                ## @clamp2
        pushq   %rbp
        movq    %rsp, %rbp
        cmpl    $5, %edi
        movl    $5, %ecx
        cmovlel %edi, %ecx
        testl   %ecx, %ecx
        movl    $0, %eax
        cmovnsl %ecx, %eax
        popq    %rbp
        ret

The move of 0 could be scheduled above the test so that it becomes an
xor reg,reg.

//===---------------------------------------------------------------------===//

GCC PR48986.  We currently compile this:

void bar(void);
void yyy(int* p) {
    if (__sync_fetch_and_add(p, -1) == 1)
      bar();
}

into:
	movl	$-1, %eax
	lock
	xaddl	%eax, (%rdi)
	cmpl	$1, %eax
	je	LBB0_2

Instead we could generate:

	lock
	decl	(%rdi)
	je	LBB0_2

The trick is to match "fetch_and_add(X, -C) == C".
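
The general shape of the pattern, as a C sketch (yyy_general is a made-up
generalization of yyy): the builtin returns the old value, and old == C
exactly when the updated value reaches zero, which is what the flags of a
locked sub/dec already encode.

void bar(void);

/* old == C  <=>  new value (old - C) == 0, so ZF from "lock sub/dec" suffices. */
void yyy_general(int *p, int C) {
  if (__sync_fetch_and_add(p, -C) == C)
    bar();
}
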
//===---------------------------------------------------------------------===//

unsigned t(unsigned a, unsigned b) {
  return a <= b ? 5 : -5;
}

We generate:
	movl	$5, %ecx
	cmpl	%esi, %edi
	movl	$-5, %eax
	cmovbel	%ecx, %eax

GCC:
	cmpl	%edi, %esi
	sbbl	%eax, %eax
	andl	$-10, %eax
	addl	$5, %eax
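
GCC's branchless sequence works because sbbl %eax, %eax materializes the
borrow of b - a as 0 or -1.  A C sketch of the same computation
(t_branchless is a made-up name):

unsigned t_branchless(unsigned a, unsigned b) {
  unsigned mask = -(unsigned)(b < a);   /* 0 if a <= b, all-ones otherwise (the sbbl) */
  return 5 + (mask & -10u);             /* andl $-10 then addl $5: 5 or -5 */
}
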
//===---------------------------------------------------------------------===//