/external/swiftshader/third_party/llvm-7.0/llvm/lib/Target/X86
Name  Date  Size
AsmParser/  22-Oct-2020
CMakeLists.txt  22-Oct-2020  2.1K
Disassembler/  22-Oct-2020
InstPrinter/  22-Oct-2020
LLVMBuild.txt  22-Oct-2020  1K
MCTargetDesc/  22-Oct-2020
README-FPStack.txt  22-Oct-2020  2.7K
README-SSE.txt  22-Oct-2020  24.2K
README-X86-64.txt  22-Oct-2020  6K
README.txt  22-Oct-2020  47.1K
ShadowCallStack.cpp  22-Oct-2020  11.3K
TargetInfo/  22-Oct-2020
Utils/  22-Oct-2020
X86.h  22-Oct-2020  5.1K
X86.td  22-Oct-2020  43.9K
X86AsmPrinter.cpp  22-Oct-2020  24.2K
X86AsmPrinter.h  22-Oct-2020  5.3K
X86AvoidStoreForwardingBlocks.cpp  22-Oct-2020  27.7K
X86CallFrameOptimization.cpp  22-Oct-2020  22.6K
X86CallingConv.cpp  22-Oct-2020  7.9K
X86CallingConv.h  22-Oct-2020  4.8K
X86CallingConv.td  22-Oct-2020  45K
X86CallLowering.cpp  22-Oct-2020  16.9K
X86CallLowering.h  22-Oct-2020  1.8K
X86CmovConversion.cpp  22-Oct-2020  34.3K
X86DomainReassignment.cpp  22-Oct-2020  25.6K
X86EvexToVex.cpp  22-Oct-2020  8.9K
X86ExpandPseudo.cpp  22-Oct-2020  13.6K
X86FastISel.cpp  22-Oct-2020  140.3K
X86FixupBWInsts.cpp  22-Oct-2020  16.1K
X86FixupLEAs.cpp  22-Oct-2020  20.8K
X86FixupSetCC.cpp  22-Oct-2020  6K
X86FlagsCopyLowering.cpp  22-Oct-2020  42.6K
X86FloatingPoint.cpp  22-Oct-2020  62K
X86FrameLowering.cpp  22-Oct-2020  120.7K
X86FrameLowering.h  22-Oct-2020  9.6K
X86GenRegisterBankInfo.def  22-Oct-2020  3.3K
X86IndirectBranchTracking.cpp  22-Oct-2020  4.1K
X86Instr3DNow.td  22-Oct-2020  5.1K
X86InstrArithmetic.td  22-Oct-2020  63.5K
X86InstrAVX512.td  22-Oct-2020  610.5K
X86InstrBuilder.h  22-Oct-2020  8.4K
X86InstrCMovSetCC.td  22-Oct-2020  5.6K
X86InstrCompiler.td  22-Oct-2020  92.8K
X86InstrControl.td  22-Oct-2020  19.5K
X86InstrExtension.td  22-Oct-2020  10.3K
X86InstrFMA.td  22-Oct-2020  32.5K
X86InstrFMA3Info.cpp  22-Oct-2020  6.5K
X86InstrFMA3Info.h  22-Oct-2020  3.3K
X86InstrFoldTables.cpp  22-Oct-2020  375.8K
X86InstrFoldTables.h  22-Oct-2020  2.6K
X86InstrFormats.td  22-Oct-2020  40.5K
X86InstrFPStack.td  22-Oct-2020  35.8K
X86InstrFragmentsSIMD.td  22-Oct-2020  53.1K
X86InstrInfo.cpp  22-Oct-2020  288.5K
X86InstrInfo.h  22-Oct-2020  28.7K
X86InstrInfo.td  22-Oct-2020  163.3K
X86InstrMMX.td  22-Oct-2020  30.4K
X86InstrMPX.td  22-Oct-2020  3.8K
X86InstrSGX.td  22-Oct-2020  1.1K
X86InstrShiftRotate.td  22-Oct-2020  45.7K
X86InstrSSE.td  22-Oct-2020  400.1K
X86InstrSVM.td  22-Oct-2020  2.1K
X86InstrSystem.td  22-Oct-2020  33.5K
X86InstrTSX.td  22-Oct-2020  2.1K
X86InstructionSelector.cpp  22-Oct-2020  54.2K
X86InstrVecCompiler.td  22-Oct-2020  27.7K
X86InstrVMX.td  22-Oct-2020  3.5K
X86InstrXOP.td  22-Oct-2020  22.7K
X86InterleavedAccess.cpp  22-Oct-2020  32.1K
X86IntrinsicsInfo.h  22-Oct-2020  83K
X86ISelDAGToDAG.cpp  22-Oct-2020  127K
X86ISelLowering.cpp  22-Oct-2020  1.6M
X86ISelLowering.h  22-Oct-2020  59.5K
X86LegalizerInfo.cpp  22-Oct-2020  14.1K
X86LegalizerInfo.h  22-Oct-2020  1.5K
X86MachineFunctionInfo.cpp  22-Oct-2020  1.1K
X86MachineFunctionInfo.h  22-Oct-2020  7.3K
X86MacroFusion.cpp  22-Oct-2020  4.7K
X86MacroFusion.h  22-Oct-2020  862
X86MCInstLower.cpp  22-Oct-2020  82K
X86OptimizeLEAs.cpp  22-Oct-2020  26.6K
X86PadShortFunction.cpp  22-Oct-2020  6.5K
X86PfmCounters.td  22-Oct-2020  4.2K
X86RegisterBankInfo.cpp  22-Oct-2020  9.4K
X86RegisterBankInfo.h  22-Oct-2020  2.8K
X86RegisterBanks.td  22-Oct-2020  601
X86RegisterInfo.cpp  22-Oct-2020  26.8K
X86RegisterInfo.h  22-Oct-2020  5.4K
X86RegisterInfo.td  22-Oct-2020  24.3K
X86RetpolineThunks.cpp  22-Oct-2020  9.6K
X86SchedBroadwell.td  22-Oct-2020  66.5K
X86SchedHaswell.td  22-Oct-2020  71K
X86SchedPredicates.td  22-Oct-2020  1.7K
X86SchedSandyBridge.td  22-Oct-2020  46.7K
X86SchedSkylakeClient.td  22-Oct-2020  72K
X86SchedSkylakeServer.td  22-Oct-2020  111.2K
X86Schedule.td  22-Oct-2020  30.7K
X86ScheduleAtom.td  22-Oct-2020  37.6K
X86ScheduleBtVer2.td  22-Oct-2020  32.3K
X86ScheduleSLM.td  22-Oct-2020  21.9K
X86ScheduleZnver1.td  22-Oct-2020  47.4K
X86SelectionDAGInfo.cpp  22-Oct-2020  11K
X86SelectionDAGInfo.h  22-Oct-2020  1.8K
X86ShuffleDecodeConstantPool.cpp  22-Oct-2020  11.1K
X86ShuffleDecodeConstantPool.h  22-Oct-2020  2K
X86SpeculativeLoadHardening.cpp  22-Oct-2020  88.6K
X86Subtarget.cpp  22-Oct-2020  12.1K
X86Subtarget.h  22-Oct-2020  27.6K
X86TargetMachine.cpp  22-Oct-2020  17.4K
X86TargetMachine.h  22-Oct-2020  2.1K
X86TargetObjectFile.cpp  22-Oct-2020  3.4K
X86TargetObjectFile.h  22-Oct-2020  3K
X86TargetTransformInfo.cpp  22-Oct-2020  123.2K
X86TargetTransformInfo.h  22-Oct-2020  6K
X86VZeroUpper.cpp  22-Oct-2020  12.4K
X86WinAllocaExpander.cpp  22-Oct-2020  9.5K
X86WinEHState.cpp  22-Oct-2020  28.9K

README-FPStack.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: FP stack related stuff
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
      8 http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
      9 
     10 //===---------------------------------------------------------------------===//
     11 
     12 This should use fiadd on chips where it is profitable:
     13 double foo(double P, int *I) { return P+*I; }
     14 
     15 We have fiadd patterns now, but the following have the same cost and
     16 complexity. We need a way to specify that the latter is more profitable.
     17 
     18 def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
     19                     [(set RFP:$dst, (fadd RFP:$src1,
     20                                      (extloadf64f32 addr:$src2)))]>;
     21                 // ST(0) = ST(0) + [mem32]
     22 
     23 def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
     24                     [(set RFP:$dst, (fadd RFP:$src1,
     25                                      (X86fild addr:$src2, i32)))]>;
     26                 // ST(0) = ST(0) + [mem32int]
     27 
     28 //===---------------------------------------------------------------------===//
     29 
     30 The FP stackifier should handle simple permutations to reduce the number of
     31 shuffle instructions, e.g. turning:
     32 
     33 fld P	->		fld Q
     34 fld Q			fld P
     35 fxch
     36 
     37 or:
     38 
     39 fxch	->		fucomi
     40 fucomi			jl X
     41 jg X
     42 
     43 Ideas:
     44 http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
     45 
     46 
     47 //===---------------------------------------------------------------------===//
     48 
     49 Add a target specific hook to DAG combiner to handle SINT_TO_FP and
     50 FP_TO_SINT when the source operand is already in memory.
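A minimal C sketch of the case in question (mine, not from the note; the function
name is made up): when the integer source is already in memory, SINT_TO_FP could be
selected as a single fild from that address instead of loading it into a GPR first.

/* Hypothetical illustration: a candidate for fild directly from memory. */
double sint_to_fp_mem(const int *p) { return (double)*p; }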
     51 
     52 //===---------------------------------------------------------------------===//
     53 
     54 Open code rint, floor, ceil, trunc:
     55 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
     56 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
     57 
     58 Opencode the sincos[f] libcall.
     59 
     60 //===---------------------------------------------------------------------===//
     61 
     62 None of the FPStack instructions are handled in
     63 X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
     64 folding spill code into the instructions.
     65 
     66 //===---------------------------------------------------------------------===//
     67 
     68 Currently the x86 codegen isn't very good at mixing SSE and FPStack
     69 code:
     70 
     71 unsigned int foo(double x) { return x; }
     72 
     73 foo:
     74 	subl $20, %esp
     75 	movsd 24(%esp), %xmm0
     76 	movsd %xmm0, 8(%esp)
     77 	fldl 8(%esp)
     78 	fisttpll (%esp)
     79 	movl (%esp), %eax
     80 	addl $20, %esp
     81 	ret
     82 
     83 This just requires being smarter when custom expanding fptoui.
     84 
     85 //===---------------------------------------------------------------------===//
     86 

README-SSE.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: SSE-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 SSE Variable shift can be custom lowered to something like this, which uses a
      8 small table + unaligned load + shuffle instead of going through memory.
      9 
     10 __m128i_shift_right:
     11 	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     12 	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
     13 
     14 ...
     15 __m128i shift_right(__m128i value, unsigned long offset) {
     16   return _mm_shuffle_epi8(value,
     17                _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
     18 }
     19 
     20 //===---------------------------------------------------------------------===//
     21 
     22 SSE has instructions for doing operations on complex numbers; we should pattern
     23 match them.  For example, this should turn into a horizontal add:
     24 
     25 typedef float __attribute__((vector_size(16))) v4f32;
     26 float f32(v4f32 A) {
     27   return A[0]+A[1]+A[2]+A[3];
     28 }
     29 
     30 Instead we get this:
     31 
     32 _f32:                                   ## @f32
     33 	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
     34 	addss	%xmm0, %xmm1
     35 	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
     36 	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
     37 	movaps	%xmm0, %xmm3
     38 	addss	%xmm1, %xmm3
     39 	movdqa	%xmm2, %xmm0
     40 	addss	%xmm3, %xmm0
     41 	ret
     42 
     43 Also, there are cases where some simple local SLP would improve codegen a bit.
     44 For example, compiling this:
     45 
     46 _Complex float f32(_Complex float A, _Complex float B) {
     47   return A+B;
     48 }
     49 
     50 into:
     51 
     52 _f32:                                   ## @f32
     53 	movdqa	%xmm0, %xmm2
     54 	addss	%xmm1, %xmm2
     55 	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
     56 	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
     57 	addss	%xmm1, %xmm3
     58 	movaps	%xmm2, %xmm0
     59 	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
     60 	ret
     61 
     62 seems silly when it could just be one addps.
     63 
     64 
     65 //===---------------------------------------------------------------------===//
     66 
     67 Expand libm rounding functions inline:  Significant speedups possible.
     68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
     69 
     70 //===---------------------------------------------------------------------===//
     71 
     72 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
     73 other fast SSE modes.
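A hedged sketch of what such a startup hook could do (the function name is mine),
using the standard SSE control-register intrinsics for FTZ and DAZ:

#include <xmmintrin.h>   /* _MM_SET_FLUSH_ZERO_MODE */
#include <pmmintrin.h>   /* _MM_SET_DENORMALS_ZERO_MODE */

/* Enable flush-to-zero and denormals-are-zero in MXCSR. */
static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
}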
     74 
     75 //===---------------------------------------------------------------------===//
     76 
     77 Think about doing i64 math in SSE regs on x86-32.
     78 
     79 //===---------------------------------------------------------------------===//
     80 
     81 This testcase should have no SSE instructions in it, and only one load from
     82 a constant pool:
     83 
     84 double %test3(bool %B) {
     85         %C = select bool %B, double 123.412, double 523.01123123
     86         ret double %C
     87 }
     88 
     89 Currently, the select is being lowered, which prevents the dag combiner from
     90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
     91 
     92 The pattern isel got this one right.
     93 
     94 //===---------------------------------------------------------------------===//
     95 
     96 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
     97 feasible.
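A hedged example of the kind of call meant here (the function name is mine): a
small, fixed-size copy that could lower to a handful of 128-bit SSE loads and
stores instead of a libcall or rep movsl.

#include <string.h>

/* A 64-byte copy: four 128-bit load/store pairs would suffice. */
void copy64(void *dst, const void *src) {
  memcpy(dst, src, 64);
}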
     98 
     99 //===---------------------------------------------------------------------===//
    100 
    101 Codegen:
    102   if (copysign(1.0, x) == copysign(1.0, y))
    103 into:
    104   if (x^y & mask)
    105 when using SSE.
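The integer form of that test, sketched in C for doubles (my own illustration,
assuming IEEE 754 bit layout): the signs agree iff the xor of the bit patterns has
the sign bit clear.

#include <string.h>

int same_sign(double x, double y) {
  unsigned long long xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) >> 63) == 0;   /* sign bits equal */
}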
    106 
    107 //===---------------------------------------------------------------------===//
    108 
    109 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
    110 of a v4sf value.
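A hedged intrinsics sketch of the two cases (the function names are mine);
_mm_loadh_pi and _mm_loadl_pi map to movhps and movlps and replace one half of the
vector from memory.

#include <xmmintrin.h>

static __m128 update_high(__m128 v, const __m64 *p) { return _mm_loadh_pi(v, p); }
static __m128 update_low (__m128 v, const __m64 *p) { return _mm_loadl_pi(v, p); }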
    111 
    112 //===---------------------------------------------------------------------===//
    113 
    114 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
    115 Perhaps use pxor / xorp* to clear an XMM register first?
    116 
    117 //===---------------------------------------------------------------------===//
    118 
    119 External test Nurbs exposed some problems. Look for
    120 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
    121 emits:
    122 
    123         movaps    (%edx), %xmm2                                 #59.21
    124         movaps    (%edx), %xmm5                                 #60.21
    125         movaps    (%edx), %xmm4                                 #61.21
    126         movaps    (%edx), %xmm3                                 #62.21
    127         movl      40(%ecx), %ebp                                #69.49
    128         shufps    $0, %xmm2, %xmm5                              #60.21
    129         movl      100(%esp), %ebx                               #69.20
    130         movl      (%ebx), %edi                                  #69.20
    131         imull     %ebp, %edi                                    #69.49
    132         addl      (%eax), %edi                                  #70.33
    133         shufps    $85, %xmm2, %xmm4                             #61.21
    134         shufps    $170, %xmm2, %xmm3                            #62.21
    135         shufps    $255, %xmm2, %xmm2                            #63.21
    136         lea       (%ebp,%ebp,2), %ebx                           #69.49
    137         negl      %ebx                                          #69.49
    138         lea       -3(%edi,%ebx), %ebx                           #70.33
    139         shll      $4, %ebx                                      #68.37
    140         addl      32(%ecx), %ebx                                #68.37
    141         testb     $15, %bl                                      #91.13
    142         jne       L_B1.24       # Prob 5%                       #91.13
    143 
    144 This is the llvm code after instruction scheduling:
    145 
    146 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    147 	%reg1078 = MOV32ri -3
    148 	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
    149 	%reg1037 = MOV32rm %reg1024, 1, %noreg, 40
    150 	%reg1080 = IMUL32rr %reg1079, %reg1037
    151 	%reg1081 = MOV32rm %reg1058, 1, %noreg, 0
    152 	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    153 	%reg1036 = MOV32rm %reg1024, 1, %noreg, 32
    154 	%reg1082 = SHL32ri %reg1038, 4
    155 	%reg1039 = ADD32rr %reg1036, %reg1082
    156 	%reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
    157 	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    158 	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    159 	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    160 	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    161 	%reg1040 = MOV32rr %reg1039
    162 	%reg1084 = AND32ri8 %reg1039, 15
    163 	CMP32ri8 %reg1084, 0
    164 	JE mbb<cond_next204,0xa914d30>
    165 
    166 Still ok. After register allocation:
    167 
    168 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    169 	%eax = MOV32ri -3
    170 	%edx = MOV32rm %stack.3, 1, %noreg, 0
    171 	ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
    172 	%edx = MOV32rm %stack.7, 1, %noreg, 0
    173 	%edx = MOV32rm %edx, 1, %noreg, 40
    174 	IMUL32rr %eax<def&use>, %edx
    175 	%esi = MOV32rm %stack.5, 1, %noreg, 0
    176 	%esi = MOV32rm %esi, 1, %noreg, 0
    177 	MOV32mr %stack.4, 1, %noreg, 0, %esi
    178 	%eax = LEA32r %esi, 1, %eax, -3
    179 	%esi = MOV32rm %stack.7, 1, %noreg, 0
    180 	%esi = MOV32rm %esi, 1, %noreg, 32
    181 	%edi = MOV32rr %eax
    182 	SHL32ri %edi<def&use>, 4
    183 	ADD32rr %edi<def&use>, %esi
    184 	%xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
    185 	%xmm1 = MOVAPSrr %xmm0
    186 	SHUFPSrr %xmm1<def&use>, %xmm1, 170
    187 	%xmm2 = MOVAPSrr %xmm0
    188 	SHUFPSrr %xmm2<def&use>, %xmm2, 0
    189 	%xmm3 = MOVAPSrr %xmm0
    190 	SHUFPSrr %xmm3<def&use>, %xmm3, 255
    191 	SHUFPSrr %xmm0<def&use>, %xmm0, 85
    192 	%ebx = MOV32rr %edi
    193 	AND32ri8 %ebx<def&use>, 15
    194 	CMP32ri8 %ebx, 0
    195 	JE mbb<cond_next204,0xa914d30>
    196 
    197 This looks really bad. The problem is that shufps is a destructive opcode: since
    198 it appears as operand two in more than one shufps op, it results in a number of
    199 copies. Note that icc suffers from the same problem. Either the instruction
    200 selector should select pshufd, or the register allocator should make the
    201 two-address to three-address transformation.
    202 
    203 It also exposes some other problems. See MOV32ri -3 and the spills.
    204 
    205 //===---------------------------------------------------------------------===//
    206 
    207 Consider:
    208 
    209 __m128 test(float a) {
    210   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
    211 }
    212 
    213 This compiles into:
    214 
    215 movss 4(%esp), %xmm1
    216 mulss %xmm1, %xmm1
    217 xorps %xmm0, %xmm0
    218 movss %xmm1, %xmm0
    219 ret
    220 
    221 Because mulss doesn't modify the top 3 elements, the top elements of 
    222 xmm1 are already zero'd.  We could compile this to:
    223 
    224 movss 4(%esp), %xmm0
    225 mulss %xmm0, %xmm0
    226 ret
    227 
    228 //===---------------------------------------------------------------------===//
    229 
    230 Here's a sick and twisted idea.  Consider code like this:
    231 
    232 __m128 test(__m128 a) {
    233   float b = *(float*)&a;
    234   ...
    235   return _mm_set_ps(0.0, 0.0, 0.0, b);
    236 }
    237 
    238 This might compile to this code:
    239 
    240 movaps c(%esp), %xmm1
    241 xorps %xmm0, %xmm0
    242 movss %xmm1, %xmm0
    243 ret
    244 
    245 Now consider if the ... code caused xmm1 to get spilled.  This might produce
    246 this code:
    247 
    248 movaps c(%esp), %xmm1
    249 movaps %xmm1, c2(%esp)
    250 ...
    251 
    252 xorps %xmm0, %xmm0
    253 movaps c2(%esp), %xmm1
    254 movss %xmm1, %xmm0
    255 ret
    256 
    257 However, since the reload is only used by these instructions, we could 
    258 "fold" it into the uses, producing something like this:
    259 
    260 movaps c(%esp), %xmm1
    261 movaps %xmm1, c2(%esp)
    262 ...
    263 
    264 movss c2(%esp), %xmm0
    265 ret
    266 
    267 ... saving two instructions.
    268 
    269 The basic idea is that a reload from a spill slot, if only one 4-byte
    270 chunk is used, can bring in 3 zeros and the one element instead of all 4 elements.
    271 This can be used to simplify a variety of shuffle operations, where the
    272 elements are fixed zeros.
    273 
    274 //===---------------------------------------------------------------------===//
    275 
    276 This code generates ugly code, probably due to costs being off or something:
    277 
    278 define void @test(float* %P, <4 x float>* %P2 ) {
    279         %xFloat0.688 = load float* %P
    280         %tmp = load <4 x float>* %P2
    281         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
    282         store <4 x float> %inFloat3.713, <4 x float>* %P2
    283         ret void
    284 }
    285 
    286 Generates:
    287 
    288 _test:
    289 	movl	8(%esp), %eax
    290 	movaps	(%eax), %xmm0
    291 	pxor	%xmm1, %xmm1
    292 	movaps	%xmm0, %xmm2
    293 	shufps	$50, %xmm1, %xmm2
    294 	shufps	$132, %xmm2, %xmm0
    295 	movaps	%xmm0, (%eax)
    296 	ret
    297 
    298 Would it be better to generate:
    299 
    300 _test:
    301         movl 8(%esp), %ecx
    302         movaps (%ecx), %xmm0
    303 	xor %eax, %eax
    304         pinsrw $6, %eax, %xmm0
    305         pinsrw $7, %eax, %xmm0
    306         movaps %xmm0, (%ecx)
    307         ret
    308 
    309 ?
    310 
    311 //===---------------------------------------------------------------------===//
    312 
    313 Some useful information in the Apple Altivec / SSE Migration Guide:
    314 
    315 http://developer.apple.com/documentation/Performance/Conceptual/
    316 Accelerate_sse_migration/index.html
    317 
    318 e.g. SSE select using and, andnot, or. Various SSE compare translations.
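One of those patterns, sketched with SSE intrinsics (my own hedged illustration):
a per-lane select built from and, andnot, and or.

#include <xmmintrin.h>

/* r[i] = mask[i] ? a[i] : b[i], for an all-ones/all-zeros mask per lane. */
static __m128 sse_select(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}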
    319 
    320 //===---------------------------------------------------------------------===//
    321 
    322 Add hooks to commute some CMPP operations.
    323 
    324 //===---------------------------------------------------------------------===//
    325 
    326 Apply the same transformation that merged four float into a single 128-bit load
    327 to loads from constant pool.
    328 
    329 //===---------------------------------------------------------------------===//
    330 
    331 Floating point max / min are commutable when -enable-unsafe-fp-path is
    332 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
    333 nodes which are selected to max / min instructions that are marked commutable.
    334 
    335 //===---------------------------------------------------------------------===//
    336 
    337 We should materialize vector constants like "all ones" and "signbit" with 
    338 code like:
    339 
    340      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    341 
    342 and:
    343      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    344      pslld   xmm1, 31     ; xmm1 = all 100000000000...
    345 
    346 instead of using a load from the constant pool.  The latter is important for
    347 ABS/NEG/copysign etc.
    348 
    349 //===---------------------------------------------------------------------===//
    350 
    351 These functions:
    352 
    353 #include <xmmintrin.h>
    354 __m128i a;
    355 void x(unsigned short n) {
    356   a = _mm_slli_epi32 (a, n);
    357 }
    358 void y(unsigned n) {
    359   a = _mm_slli_epi32 (a, n);
    360 }
    361 
    362 compile to ( -O3 -static -fomit-frame-pointer):
    363 _x:
    364         movzwl  4(%esp), %eax
    365         movd    %eax, %xmm0
    366         movaps  _a, %xmm1
    367         pslld   %xmm0, %xmm1
    368         movaps  %xmm1, _a
    369         ret
    370 _y:
    371         movd    4(%esp), %xmm0
    372         movaps  _a, %xmm1
    373         pslld   %xmm0, %xmm1
    374         movaps  %xmm1, _a
    375         ret
    376 
    377 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
    378 like movd would be sufficient in both cases as the value is already zero 
    379 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
    380 save, as a really-signed value would be undefined for pslld.
    381 
    382 
    383 //===---------------------------------------------------------------------===//
    384 
    385 #include <math.h>
    386 int t1(double d) { return signbit(d); }
    387 
    388 This currently compiles to:
    389 	subl	$12, %esp
    390 	movsd	16(%esp), %xmm0
    391 	movsd	%xmm0, (%esp)
    392 	movl	4(%esp), %eax
    393 	shrl	$31, %eax
    394 	addl	$12, %esp
    395 	ret
    396 
    397 We should use movmskp{s|d} instead.
    398 
    399 //===---------------------------------------------------------------------===//
    400 
    401 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
    402 (aligned) vector load.  This functionality has a couple of problems.
    403 
    404 1. The code to infer alignment from loads of globals is in the X86 backend,
    405    not the dag combiner.  This is because dagcombine2 needs to be able to see
    406    through the X86ISD::Wrapper node, which DAGCombine can't really do.
    407 2. The code for turning 4 x load into a single vector load is target 
    408    independent and should be moved to the dag combiner.
    409 3. The code for turning 4 x load into a vector load can only handle a direct 
    410    load from a global or a direct load from the stack.  It should be generalized
    411    to handle any load from P, P+4, P+8, P+12, where P can be anything.
    412 4. The alignment inference code cannot handle loads from globals in non-static
    413    mode because it doesn't look through the extra dyld stub load.  If you try
    414    vec_align.ll without -relocation-model=static, you'll see what I mean.
    415 
    416 //===---------------------------------------------------------------------===//
    417 
    418 We should lower store(fneg(load p), q) into an integer load+xor+store, which
    419 eliminates a constant pool load.  For example, consider:
    420 
    421 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
    422 entry:
    423  %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
    424  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
    425  ret i64 %tmp20
    426 }
    427 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
    428 
    429 This currently compiles to:
    430 
    431 LCPI1_0:					#  <4 x float>
    432 	.long	2147483648	# float -0
    433 	.long	2147483648	# float -0
    434 	.long	2147483648	# float -0
    435 	.long	2147483648	# float -0
    436 _ccosf:
    437 	subl	$12, %esp
    438 	movss	16(%esp), %xmm0
    439 	movss	%xmm0, 4(%esp)
    440 	movss	20(%esp), %xmm0
    441 	xorps	LCPI1_0, %xmm0
    442 	movss	%xmm0, (%esp)
    443 	call	L_ccoshf$stub
    444 	addl	$12, %esp
    445 	ret
    446 
    447 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
    448 this code computes the pic base and does two loads to do the constant pool 
    449 load, so the improvement is much bigger.
    450 
    451 The tricky part about this xform is that the argument load/store isn't exposed
    452 until post-legalize, and at that point, the fneg has been custom expanded into 
    453 an X86 fxor.  This means that we need to handle this case in the x86 backend
    454 instead of in target independent code.
    455 
    456 //===---------------------------------------------------------------------===//
    457 
    458 Non-SSE4 insert into 16 x i8 is atrociously bad.
    459 
    460 //===---------------------------------------------------------------------===//
    461 
    462 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
    463 is memory.
    464 
    465 //===---------------------------------------------------------------------===//
    466 
    467 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
    468 any number of 0.0 simultaneously.  Currently we only use it for simple
    469 insertions.
    470 
    471 See comments in LowerINSERT_VECTOR_ELT_SSE4.
    472 
    473 //===---------------------------------------------------------------------===//
    474 
    475 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
    476 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
    477 legal, it'll just take a few extra patterns written in the .td file.
    478 
    479 Note: this is not a code quality issue; the custom lowered code happens to be
    480 right, but we shouldn't have to custom lower anything.  This is probably related
    481 to <2 x i64> ops being so bad.
    482 
    483 //===---------------------------------------------------------------------===//
    484 
    485 LLVM currently generates stack realignment code when it is not actually
    486 needed. The problem is that we need to know about stack alignment too early,
    487 before RA runs.
    488 
    489 At that point we don't know whether there will be a vector spill or not.
    490 The stack realignment logic is overly conservative here, but otherwise we can
    491 produce unaligned loads/stores.
    492 
    493 Fixing this will require some huge RA changes.
    494 
    495 Testcase:
    496 #include <emmintrin.h>
    497 
    498 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
    499 
    500 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
    501 - 22725, - 12873};;
    502 
    503 vSInt16 madd(vSInt16 b)
    504 {
    505     return _mm_madd_epi16(a, b);
    506 }
    507 
    508 Generated code (x86-32, linux):
    509 madd:
    510         pushl   %ebp
    511         movl    %esp, %ebp
    512         andl    $-16, %esp
    513         movaps  .LCPI1_0, %xmm1
    514         pmaddwd %xmm1, %xmm0
    515         movl    %ebp, %esp
    516         popl    %ebp
    517         ret
    518 
    519 //===---------------------------------------------------------------------===//
    520 
    521 Consider:
    522 #include <emmintrin.h> 
    523 __m128 foo2 (float x) {
    524  return _mm_set_ps (0, 0, x, 0);
    525 }
    526 
    527 In x86-32 mode, we generate this spiffy code:
    528 
    529 _foo2:
    530 	movss	4(%esp), %xmm0
    531 	pshufd	$81, %xmm0, %xmm0
    532 	ret
    533 
    534 in x86-64 mode, we generate this code, which could be better:
    535 
    536 _foo2:
    537 	xorps	%xmm1, %xmm1
    538 	movss	%xmm0, %xmm1
    539 	pshufd	$81, %xmm1, %xmm0
    540 	ret
    541 
    542 In sse4 mode, we could use insertps to make both better.
    543 
    544 Here's another testcase that could use insertps [mem]:
    545 
    546 #include <xmmintrin.h>
    547 extern float x2, x3;
    548 __m128 foo1 (float x1, float x4) {
    549  return _mm_set_ps (x2, x1, x3, x4);
    550 }
    551 
    552 gcc mainline compiles it to:
    553 
    554 foo1:
    555        insertps        $0x10, x2(%rip), %xmm0
    556        insertps        $0x10, x3(%rip), %xmm1
    557        movaps  %xmm1, %xmm2
    558        movlhps %xmm0, %xmm2
    559        movaps  %xmm2, %xmm0
    560        ret
    561 
    562 //===---------------------------------------------------------------------===//
    563 
    564 We compile vector multiply-by-constant into poor code:
    565 
    566 define <4 x i32> @f(<4 x i32> %i) nounwind  {
    567 	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
    568 	ret <4 x i32> %A
    569 }
    570 
    571 On targets without SSE4.1, this compiles into:
    572 
    573 LCPI1_0:					##  <4 x i32>
    574 	.long	10
    575 	.long	10
    576 	.long	10
    577 	.long	10
    578 	.text
    579 	.align	4,0x90
    580 	.globl	_f
    581 _f:
    582 	pshufd	$3, %xmm0, %xmm1
    583 	movd	%xmm1, %eax
    584 	imull	LCPI1_0+12, %eax
    585 	movd	%eax, %xmm1
    586 	pshufd	$1, %xmm0, %xmm2
    587 	movd	%xmm2, %eax
    588 	imull	LCPI1_0+4, %eax
    589 	movd	%eax, %xmm2
    590 	punpckldq	%xmm1, %xmm2
    591 	movd	%xmm0, %eax
    592 	imull	LCPI1_0, %eax
    593 	movd	%eax, %xmm1
    594 	movhlps	%xmm0, %xmm0
    595 	movd	%xmm0, %eax
    596 	imull	LCPI1_0+8, %eax
    597 	movd	%eax, %xmm0
    598 	punpckldq	%xmm0, %xmm1
    599 	movaps	%xmm1, %xmm0
    600 	punpckldq	%xmm2, %xmm0
    601 	ret
    602 
    603 It would be better to synthesize integer vector multiplication by constants
    604 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
    605 simple cases such as multiplication by powers of two would be better as
    606 vector shifts than as multiplications.
    607 
    608 //===---------------------------------------------------------------------===//
    609 
    610 We compile this:
    611 
    612 __m128i
    613 foo2 (char x)
    614 {
    615   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
    616 }
    617 
    618 into:
    619 	movl	$1, %eax
    620 	xorps	%xmm0, %xmm0
    621 	pinsrw	$2, %eax, %xmm0
    622 	movzbl	4(%esp), %eax
    623 	pinsrw	$3, %eax, %xmm0
    624 	movl	$256, %eax
    625 	pinsrw	$7, %eax, %xmm0
    626 	ret
    627 
    628 
    629 gcc-4.2:
    630 	subl	$12, %esp
    631 	movzbl	16(%esp), %eax
    632 	movdqa	LC0, %xmm0
    633 	pinsrw	$3, %eax, %xmm0
    634 	addl	$12, %esp
    635 	ret
    636 	.const
    637 	.align 4
    638 LC0:
    639 	.word	0
    640 	.word	0
    641 	.word	1
    642 	.word	0
    643 	.word	0
    644 	.word	0
    645 	.word	0
    646 	.word	256
    647 
    648 With SSE4, it should be
    649       movdqa  .LC0(%rip), %xmm0
    650       pinsrb  $6, %edi, %xmm0
    651 
    652 //===---------------------------------------------------------------------===//
    653 
    654 We should transform a shuffle of two vectors of constants into a single vector
    655 of constants. Insertelement of a constant into a vector of constants
    656 should also result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.
    657 
    658 We compiled it to something horrible:
    659 
    660 	.align	4
    661 LCPI1_1:					##  float
    662 	.long	1065353216	## float 1
    663 	.const
    664 
    665 	.align	4
    666 LCPI1_0:					##  <4 x float>
    667 	.space	4
    668 	.long	1065353216	## float 1
    669 	.space	4
    670 	.long	1065353216	## float 1
    671 	.text
    672 	.align	4,0x90
    673 	.globl	_t
    674 _t:
    675 	xorps	%xmm0, %xmm0
    676 	movhps	LCPI1_0, %xmm0
    677 	movss	LCPI1_1, %xmm1
    678 	movaps	%xmm0, %xmm2
    679 	shufps	$2, %xmm1, %xmm2
    680 	shufps	$132, %xmm2, %xmm0
    681 	movaps	%xmm0, 0
    682 
    683 //===---------------------------------------------------------------------===//
    684 rdar://5907648
    685 
    686 This function:
    687 
    688 float foo(unsigned char x) {
    689   return x;
    690 }
    691 
    692 compiles to (x86-32):
    693 
    694 define float @foo(i8 zeroext  %x) nounwind  {
    695 	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
    696 	ret float %tmp12
    697 }
    698 
    699 compiles to:
    700 
    701 _foo:
    702 	subl	$4, %esp
    703 	movzbl	8(%esp), %eax
    704 	cvtsi2ss	%eax, %xmm0
    705 	movss	%xmm0, (%esp)
    706 	flds	(%esp)
    707 	addl	$4, %esp
    708 	ret
    709 
    710 We should be able to use:
    711   cvtsi2ss 8(%esp), %xmm0
    712 since we know the stack slot is already zext'd.
    713 
    714 //===---------------------------------------------------------------------===//
    715 
    716 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
    717 when code size is critical. movlps is slower than movsd on core2 but it's one
    718 byte shorter.
    719 
    720 //===---------------------------------------------------------------------===//
    721 
    722 We should use a dynamic programming based approach to tell when using FPStack
    723 operations is cheaper than SSE.  SciMark montecarlo contains code like this
    724 for example:
    725 
    726 double MonteCarlo_num_flops(int Num_samples) {
    727     return ((double) Num_samples)* 4.0;
    728 }
    729 
    730 In fpstack mode, this compiles into:
    731 
    732 LCPI1_0:					
    733 	.long	1082130432	## float 4.000000e+00
    734 _MonteCarlo_num_flops:
    735 	subl	$4, %esp
    736 	movl	8(%esp), %eax
    737 	movl	%eax, (%esp)
    738 	fildl	(%esp)
    739 	fmuls	LCPI1_0
    740 	addl	$4, %esp
    741 	ret
    742         
    743 in SSE mode, it compiles into significantly slower code:
    744 
    745 _MonteCarlo_num_flops:
    746 	subl	$12, %esp
    747 	cvtsi2sd	16(%esp), %xmm0
    748 	mulsd	LCPI1_0, %xmm0
    749 	movsd	%xmm0, (%esp)
    750 	fldl	(%esp)
    751 	addl	$12, %esp
    752 	ret
    753 
    754 There are also other cases in scimark where using fpstack is better; it is
    755 cheaper to do fld1 than to load from a constant pool, for example, so
    756 "load, add 1.0, store" is better done on the fp stack, etc.
    757 
    758 //===---------------------------------------------------------------------===//
    759 
    760 These should compile into the same code (PR6214). Perhaps instcombine should
    761 canonicalize the former into the latter?
    762 
    763 define float @foo(float %x) nounwind {
    764   %t = bitcast float %x to i32
    765   %s = and i32 %t, 2147483647
    766   %d = bitcast i32 %s to float
    767   ret float %d
    768 }
    769 
    770 declare float @fabsf(float %n)
    771 define float @bar(float %x) nounwind {
    772   %d = call float @fabsf(float %x)
    773   ret float %d
    774 }
    775 
    776 //===---------------------------------------------------------------------===//
    777 
    778 This IR (from PR6194):
    779 
    780 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
    781 target triple = "x86_64-apple-darwin10.0.0"
    782 
    783 %0 = type { double, double }
    784 %struct.float3 = type { float, float, float }
    785 
    786 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
    787 entry:
    788   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
    789   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
    790   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
    791   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
    792   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
    793   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
    794   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
    795   store float %tmp12, float* %tmp5
    796   ret void
    797 }
    798 
    799 Compiles to:
    800 
    801 _test:                                  ## @test
    802 	movd	%xmm0, %rax
    803 	shrq	$32, %rax
    804 	movl	%eax, 4(%rdi)
    805 	ret
    806 
    807 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
    808 doing a shuffle from v[1] to v[0] then a float store.
    809 
    810 //===---------------------------------------------------------------------===//
    811 
    812 [UNSAFE FP]
    813 
    814 void foo(double, double, double);
    815 void norm(double x, double y, double z) {
    816   double scale = __builtin_sqrt(x*x + y*y + z*z);
    817   foo(x/scale, y/scale, z/scale);
    818 }
    819 
    820 We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
    821 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
    822 and emit 3 mulsd in place of the divs. This can be done as a target-independent
    823 transform.
    824 
    825 If we're dealing with floats instead of doubles we could even replace the sqrtss
    826 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
    827 cost of reduced accuracy.
    828 
    829 //===---------------------------------------------------------------------===//
    830 
    831 This function should be matched to haddpd when the appropriate CPU is enabled:
    832 
    833 #include <x86intrin.h>
    834 double f (__m128d p) {
    835   return p[0] + p[1];
    836 }
    837 
    838 similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
    839 turn into hsubpd also.
    840 
    841 //===---------------------------------------------------------------------===//
    842 

README-X86-64.txt

      1 //===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
      2 
      3 AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
      4 multiplication by a constant. How much of it applies to Intel's X86-64
      5 implementation? There are definite trade-offs to consider: latency vs. register
      6 pressure vs. code size.
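As a concrete (hedged) example of the trade-off, multiplication by 10 can be done
either with one imul or with an lea/add sequence; which is better depends on
latency, register pressure, and code size.

/* x * 10 = (x + 4*x) * 2: an LEA plus an add (or a second LEA) instead of one imul. */
long mul_by_10(long x) { return x * 10; }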
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 Are we better off using branches instead of cmove to implement FP to
     11 unsigned i64?
     12 
     13 _conv:
     14 	ucomiss	LC0(%rip), %xmm0
     15 	cvttss2siq	%xmm0, %rdx
     16 	jb	L3
     17 	subss	LC0(%rip), %xmm0
     18 	movabsq	$-9223372036854775808, %rax
     19 	cvttss2siq	%xmm0, %rdx
     20 	xorq	%rax, %rdx
     21 L3:
     22 	movq	%rdx, %rax
     23 	ret
     24 
     25 instead of
     26 
     27 _conv:
     28 	movss LCPI1_0(%rip), %xmm1
     29 	cvttss2siq %xmm0, %rcx
     30 	movaps %xmm0, %xmm2
     31 	subss %xmm1, %xmm2
     32 	cvttss2siq %xmm2, %rax
     33 	movabsq $-9223372036854775808, %rdx
     34 	xorq %rdx, %rax
     35 	ucomiss %xmm1, %xmm0
     36 	cmovb %rcx, %rax
     37 	ret
     38 
     39 It seems like the jb branch has a high likelihood of being taken. It would have
     40 saved a few instructions.
     41 
     42 //===---------------------------------------------------------------------===//
     43 
     44 It's not possible to reference the AH, BH, CH, and DH registers in an instruction
     45 requiring a REX prefix. However, divb and mulb both produce results in AH. If isel
     46 emits a CopyFromReg of AH, it gets turned into a movb whose destination can be
     47 allocated to r8b - r15b, which is not encodable.
     48 
     49 To get around this, isel emits a CopyFromReg from AX and then right shifts it
     50 down by 8 and truncates it. It's not pretty, but it works. We need some register
     51 allocation magic to make the hack go away (e.g. putting additional constraints
     52 on the result of the movb).
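For reference, a tiny case that produces a result in AH (my own illustration):
the remainder of an 8-bit divide.

/* divb leaves the quotient in AL and the remainder in AH. */
unsigned char rem8(unsigned char a, unsigned char b) { return a % b; }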
     53 
     54 //===---------------------------------------------------------------------===//
     55 
     56 The x86-64 ABI for hidden-argument struct returns requires that the
     57 incoming value of %rdi be copied into %rax by the callee upon return.
     58 
     59 The idea is that it saves callers from having to remember this value,
     60 which would often require a callee-saved register. Callees usually
     61 need to keep this value live for most of their body anyway, so it
     62 doesn't add a significant burden on them.
     63 
     64 We currently implement this in codegen; however, this is suboptimal
     65 because it makes it quite awkward to implement the
     66 optimization for callers.
     67 
     68 A better implementation would be to relax the LLVM IR rules for sret
     69 arguments to allow a function with an sret argument to have a non-void
     70 return type, and to have the front-end set up the sret argument value
     71 as the return value of the function. The front-end could more easily
     72 emit uses of the returned struct value to be in terms of the function's
     73 lowered return value, and it would free non-C frontends from a
     74 complication only required by a C-based ABI.
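A hedged C illustration of the case (the struct is mine): a by-value struct return
that does not fit in registers is returned via a hidden pointer passed in %rdi,
which the callee must hand back in %rax.

struct big { long a, b, c; };

/* Returned in memory: the caller passes &result in %rdi and expects it back in %rax. */
struct big make_big(long x) {
  struct big r = { x, x + 1, x + 2 };
  return r;
}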
     75 
     76 //===---------------------------------------------------------------------===//
     77 
     78 We get a redundant zero extension for code like this:
     79 
     80 int mask[1000];
     81 int foo(unsigned x) {
     82  if (x < 10)
     83    x = x * 45;
     84  else
     85    x = x * 78;
     86  return mask[x];
     87 }
     88 
     89 _foo:
     90 LBB1_0:	## entry
     91 	cmpl	$9, %edi
     92 	jbe	LBB1_3	## bb
     93 LBB1_1:	## bb1
     94 	imull	$78, %edi, %eax
     95 LBB1_2:	## bb2
     96 	movl	%eax, %eax                    <----
     97 	movq	_mask@GOTPCREL(%rip), %rcx
     98 	movl	(%rcx,%rax,4), %eax
     99 	ret
    100 LBB1_3:	## bb
    101 	imull	$45, %edi, %eax
    102 	jmp	LBB1_2	## bb2
    103   
    104 Before regalloc, we have:
    105 
    106         %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags
    107         JMP mbb<bb2,0x203afb0>
    108     Successors according to CFG: 0x203afb0 (#3)
    109 
    110 bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
    111     Predecessors according to CFG: 0x203aec0 (#0)
    112         %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags
    113     Successors according to CFG: 0x203afb0 (#3)
    114 
    115 bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
    116     Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
    117         %reg1027 = PHI %reg1025, mbb<bb,0x203af10>,
    118                             %reg1026, mbb<bb1,0x203af60>
    119         %reg1029 = MOVZX64rr32 %reg1027
    120 
    121 so we'd have to know that IMUL32rri8 leaves the high word zero extended and
    122 be able to recognize the zero extend.  This could also presumably be implemented
    123 if we had whole-function selectiondags.
    124 
    125 //===---------------------------------------------------------------------===//
    126 
    127 Take the following code
    128 (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
    129 extern unsigned long table[];
    130 unsigned long foo(unsigned char *p) {
    131   unsigned long tag = *p;
    132   return table[tag >> 4] + table[tag & 0xf];
    133 }
    134 
    135 Current code generated:
    136 	movzbl	(%rdi), %eax
    137 	movq	%rax, %rcx
    138 	andq	$240, %rcx
    139 	shrq	%rcx
    140 	andq	$15, %rax
    141 	movq	table(,%rax,8), %rax
    142 	addq	table(%rcx), %rax
    143 	ret
    144 
    145 Issues:
    146 1. First movq should be movl; saves a byte.
    147 2. Both andq's should be andl; saves another two bytes.  I think this was
    148    implemented at one point, but subsequently regressed.
    149 3. shrq should be shrl; saves another byte.
    150 4. The first andq can be completely eliminated by using a slightly more
    151    expensive addressing mode.
    152 
    153 //===---------------------------------------------------------------------===//
    154 
    155 Consider the following (contrived testcase, but contains common factors):
    156 
    157 #include <stdarg.h>
    158 int test(int x, ...) {
    159   int sum, i;
    160   va_list l;
    161   va_start(l, x);
    162   for (i = 0; i < x; i++)
    163     sum += va_arg(l, int);
    164   va_end(l);
    165   return sum;
    166 }
    167 
    168 Testcase given in C because fixing it will likely involve changing the IR
    169 generated for it.  The primary issue with the result is that it doesn't do any
    170 of the optimizations which are possible if we know the address of a va_list
    171 in the current function is never taken:
    172 1. We shouldn't spill the XMM registers because we only call va_arg with "int".
    173 2. It would be nice if we could sroa the va_list.
    174 3. Probably overkill, but it'd be cool if we could peel off the first five
    175 iterations of the loop.
    176 
    177 Other optimizations involving functions which use va_arg on floats which don't
    178 have the address of a va_list taken:
    179 1. Conversely to the above, we shouldn't spill general registers if we only
    180    call va_arg on "double".
    181 2. If we know nothing more than 64 bits wide is read from the XMM registers,
    182    we can change the spilling code to reduce the amount of stack used by half.
    183 
    184 //===---------------------------------------------------------------------===//
    185 

README.txt

      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend.
      3 //===---------------------------------------------------------------------===//
      4 
      5 Improvements to the multiply -> shift/add algorithm:
      6 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 Improve code like this (occurs fairly frequently, e.g. in LLVM):
     11 long long foo(int x) { return 1LL << x; }
     12 
     13 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
     14 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
     15 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
     16 
     17 Another useful one would be  ~0ULL >> X and ~0ULL << X.
     18 
     19 One better solution for 1LL << x is:
     20         xorl    %eax, %eax
     21         xorl    %edx, %edx
     22         testb   $32, %cl
     23         sete    %al
     24         setne   %dl
     25         sall    %cl, %eax
     26         sall    %cl, %edx
     27 
     28 But that requires good 8-bit subreg support.
     29 
     30 Also, this might be better.  It's an extra shift, but it's one instruction
     31 shorter, and doesn't stress 8-bit subreg support.
     32 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
     33 but without the unnecessary and.)
     34         movl %ecx, %eax
     35         shrl $5, %eax
     36         movl %eax, %edx
     37         xorl $1, %edx
     38         sall %cl, %eax
     39         sall %cl, %edx
     40 
     41 64-bit shifts (in general) expand to really bad code.  Instead of using
     42 cmovs, we should expand to a conditional branch like GCC produces.
     43 
     44 //===---------------------------------------------------------------------===//
     45 
     46 Some isel ideas:
     47 
     48 1. Dynamic programming based approach when compile time is not an
     49    issue.
     50 2. Code duplication (addressing mode) during isel.
     51 3. Other ideas from "Register-Sensitive Selection, Duplication, and
     52    Sequencing of Instructions".
     53 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
     54    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
     55    and other related papers.
     56    http://citeseer.ist.psu.edu/govindarajan01minimum.html
     57 
     58 //===---------------------------------------------------------------------===//
     59 
     60 Should we promote i16 to i32 to avoid partial register update stalls?
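A hedged example of the kind of code in question (mine, not from the note): a
16-bit operation writes only the low 16 bits of its destination register, leaving
a merge/false dependency on the old upper bits.

/* addw/movw here update only the low half of the register. */
unsigned short add16(unsigned short a, unsigned short b) {
  return (unsigned short)(a + b);
}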
     61 
     62 //===---------------------------------------------------------------------===//
     63 
     64 Leave any_extend as a pseudo instruction and hint to the register
     65 allocator. Delay codegen until post register allocation.
     66 Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
     67 the coalescer how to deal with it, though.
     68 
     69 //===---------------------------------------------------------------------===//
     70 
     71 It appears icc uses push for parameter passing. Need to investigate.
     72 
     73 //===---------------------------------------------------------------------===//
     74 
     75 The instruction selector sometimes misses folding a load into a compare.  The
     76 pattern is written as (cmp reg, (load p)).  Because the compare isn't
     77 commutative, it is not matched with the load on both sides.  The dag combiner
     78 should be made smart enough to canonicalize the load into the RHS of a compare
     79 when it can invert the result of the compare for free.
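A hedged example of the pattern (mine): with the load on the left-hand side,
(cmp (load p), reg) does not match the (cmp reg, (load p)) pattern unless the
operands are swapped and the condition inverted.

int lt_mem(const int *p, int x) { return *p < x; }   /* load is the LHS of the compare */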
     80 
     81 //===---------------------------------------------------------------------===//
     82 
     83 In many cases, LLVM generates code like this:
     84 
     85 _test:
     86         movl 8(%esp), %eax
     87         cmpl %eax, 4(%esp)
     88         setl %al
     89         movzbl %al, %eax
     90         ret
     91 
     92 On some processors (which ones?), it is more efficient to do this:
     93 
     94 _test:
     95         movl 8(%esp), %ebx
     96         xor  %eax, %eax
     97         cmpl %ebx, 4(%esp)
     98         setl %al
     99         ret
    100 
    101 Doing this correctly is tricky though, as the xor clobbers the flags.
    102 
    103 //===---------------------------------------------------------------------===//
    104 
    105 We should generate bts/btr/etc instructions on targets where they are cheap or
    106 when codesize is important.  e.g., for:
    107 
    108 void setbit(int *target, int bit) {
    109     *target |= (1 << bit);
    110 }
    111 void clearbit(int *target, int bit) {
    112     *target &= ~(1 << bit);
    113 }
    114 
    115 //===---------------------------------------------------------------------===//
    116 
    117 Instead of the following for memset char*, 1, 10:
    118 
    119 	movl $16843009, 4(%edx)
    120 	movl $16843009, (%edx)
    121 	movw $257, 8(%edx)
    122 
    123 It might be better to generate
    124 
    125 	movl $16843009, %eax
    126 	movl %eax, 4(%edx)
    127 	movl %eax, (%edx)
    128 	movw %ax, 8(%edx)
    129 	
    130 when we can spare a register. It reduces code size.
    131 
    132 //===---------------------------------------------------------------------===//
    133 
    134 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
    135 get this:
    136 
    137 define i32 @test1(i32 %X) {
    138     %Y = sdiv i32 %X, 8
    139     ret i32 %Y
    140 }
    141 
    142 _test1:
    143         movl 4(%esp), %eax
    144         movl %eax, %ecx
    145         sarl $31, %ecx
    146         shrl $29, %ecx
    147         addl %ecx, %eax
    148         sarl $3, %eax
    149         ret
    150 
    151 GCC knows several different ways to codegen it, one of which is this:
    152 
    153 _test1:
    154         movl    4(%esp), %eax
    155         cmpl    $-1, %eax
    156         leal    7(%eax), %ecx
    157         cmovle  %ecx, %eax
    158         sarl    $3, %eax
    159         ret
    160 
    161 which is probably slower, but it's interesting at least :)
    162 
    163 //===---------------------------------------------------------------------===//
    164 
    165 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
    166 We should leave these as libcalls for everything over a much lower threshold,
    167 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
    168 stores, TLB preheating, etc)
    169 
    170 //===---------------------------------------------------------------------===//
    171 
    172 Optimize this into something reasonable:
    173  x * copysign(1.0, y) * copysign(1.0, z)
    174 
    175 //===---------------------------------------------------------------------===//
    176 
    177 Optimize copysign(x, *y) to use an integer load from y.
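A hedged sketch of the integer form (assuming IEEE 754 doubles; the names are
mine): only the sign bit of *y is needed, so an integer load of y suffices.

#include <string.h>

double copysign_mem(double x, const double *y) {
  unsigned long long xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, y,  sizeof yb);                        /* integer load of the sign source */
  xb = (xb & ~(1ULL << 63)) | (yb & (1ULL << 63));   /* splice in the sign bit */
  memcpy(&x, &xb, sizeof x);
  return x;
}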
    178 
    179 //===---------------------------------------------------------------------===//
    180 
    181 The following tests perform worse with LSR:
    182 
    183 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
    184 
    185 //===---------------------------------------------------------------------===//
    186 
    187 Adding to the list of cmp / test poor codegen issues:
    188 
    189 int test(__m128 *A, __m128 *B) {
    190   if (_mm_comige_ss(*A, *B))
    191     return 3;
    192   else
    193     return 4;
    194 }
    195 
    196 _test:
    197 	movl 8(%esp), %eax
    198 	movaps (%eax), %xmm0
    199 	movl 4(%esp), %eax
    200 	movaps (%eax), %xmm1
    201 	comiss %xmm0, %xmm1
    202 	setae %al
    203 	movzbl %al, %ecx
    204 	movl $3, %eax
    205 	movl $4, %edx
    206 	cmpl $0, %ecx
    207 	cmove %edx, %eax
    208 	ret
    209 
    210 Note that the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.
    211 There are a number of issues. 1) We are introducing a setcc between the result of
    212 the intrinsic call and the select. 2) The intrinsic is expected to produce an i32
    213 value, so an any_extend (which becomes a zero extend) is added.
    214 
    215 We probably need some kind of target DAG combine hook to fix this.
    216 
    217 //===---------------------------------------------------------------------===//
    218 
    219 We generate significantly worse code for this than GCC:
    220 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
    221 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
    222 
    223 There is also one case we do worse on PPC.
    224 
    225 //===---------------------------------------------------------------------===//
    226 
    227 For this:
    228 
    229 int test(int a)
    230 {
    231   return a * 3;
    232 }
    233 
    234 We currently emit
    235 	imull $3, 4(%esp), %eax
    236 
    237 Perhaps what we really should generate is the following? Is imull three or four
    238 cycles? Note: ICC generates this:
    239 	movl	4(%esp), %eax
    240 	leal	(%eax,%eax,2), %eax
    241 
    242 The current instruction priority is based on pattern complexity. The former is
    243 more "complex" because it folds a load so the latter will not be emitted.
    244 
    245 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
    246 should always try to match LEA first since the LEA matching code does some
    247 estimate to determine whether the match is profitable.
    248 
    249 However, if we care more about code size, then imull is better. It's two bytes
    250 shorter than movl + leal.
    251 
    252 On a Pentium M, both variants have the same characteristics with regard
    253 to throughput; however, the multiplication has a latency of four cycles, as
    254 opposed to two cycles for the movl+lea variant.
    255 
    256 //===---------------------------------------------------------------------===//
    257 
    258 It appears gcc places string data with linkonce linkage in
    259 .section __TEXT,__const_coal,coalesced instead of
    260 .section __DATA,__const_coal,coalesced.
    261 Take a look at darwin.h; there are other Darwin assembler directives that we
    262 do not make use of.
    263 
    264 //===---------------------------------------------------------------------===//
    265 
    266 define i32 @foo(i32* %a, i32 %t) {
    267 entry:
    268 	br label %cond_true
    269 
    270 cond_true:		; preds = %cond_true, %entry
    271 	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
    272 	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
    273 	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
    274 	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
    275 	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
    276 	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
    277 	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
    278 	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
    279 	br i1 %tmp, label %bb12, label %cond_true
    280 
    281 bb12:		; preds = %cond_true
    282 	ret i32 %tmp7
    283 }
    284 is pessimized by -loop-reduce and -indvars
    285 
    286 //===---------------------------------------------------------------------===//
    287 
    288 u32 to float conversion improvement:
    289 
    290 float uint32_2_float( unsigned u ) {
    291   float fl = (int) (u & 0xffff);
    292   float fh = (int) (u >> 16);
    293   fh *= 0x1.0p16f;
    294   return fh + fl;
    295 }
    296 
    297 00000000        subl    $0x04,%esp
    298 00000003        movl    0x08(%esp,1),%eax
    299 00000007        movl    %eax,%ecx
    300 00000009        shrl    $0x10,%ecx
    301 0000000c        cvtsi2ss        %ecx,%xmm0
    302 00000010        andl    $0x0000ffff,%eax
    303 00000015        cvtsi2ss        %eax,%xmm1
    304 00000019        mulss   0x00000078,%xmm0
    305 00000021        addss   %xmm1,%xmm0
    306 00000025        movss   %xmm0,(%esp,1)
    307 0000002a        flds    (%esp,1)
    308 0000002d        addl    $0x04,%esp
    309 00000030        ret
    310 
    311 //===---------------------------------------------------------------------===//
    312 
    313 When using the fastcc ABI, align the stack slot of an argument of type double
    314 on an 8-byte boundary to improve performance.
    315 
    316 //===---------------------------------------------------------------------===//
    317 
    318 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
    319 simplifications for integer "x cmp y ? a : b".
    320 
    321 //===---------------------------------------------------------------------===//
    322 
    323 Consider the expansion of:
    324 
    325 define i32 @test3(i32 %X) {
    326         %tmp1 = urem i32 %X, 255
    327         ret i32 %tmp1
    328 }
    329 
    330 Currently it compiles to:
    331 
    332 ...
    333         movl $2155905153, %ecx
    334         movl 8(%esp), %esi
    335         movl %esi, %eax
    336         mull %ecx
    337 ...
    338 
    339 This could be "reassociated" into:
    340 
    341         movl $2155905153, %eax
    342         movl 8(%esp), %ecx
    343         mull %ecx
    344 
    345 to avoid the copy.  In fact, the existing two-address stuff would do this
    346 except that mul isn't a commutative 2-addr instruction.  I guess this has
    347 to be done at isel time based on the #uses to mul?
    348 
    349 //===---------------------------------------------------------------------===//
    350 
    351 Make sure the instruction which starts a loop does not cross a cacheline
    352 boundary. This requires knowing the exact length of each machine instruction.
    353 That is somewhat complicated, but doable. Example 256.bzip2:
    354 
    355 In the new trace, the hot loop has an instruction which crosses a cacheline
    356 boundary.  In addition to potential cache misses, this can't help decoding as I
    357 imagine there has to be some kind of complicated decoder reset and realignment
    358 to grab the bytes from the next cacheline.
    359 
    360 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
    361 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
    362 937  937 0x3d0a incl     %esi
    363 3    3   0x3d0b cmpb     %bl, %dl
    364 27   27  0x3d0d jnz      0x000062db <main+11707>
    365 
    366 //===---------------------------------------------------------------------===//
    367 
    368 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
    369 
    370 //===---------------------------------------------------------------------===//
    371 
    372 This could be a single 16-bit load.
    373 
    374 int f(char *p) {
    375     if ((p[0] == 1) & (p[1] == 2)) return 1;
    376     return 0;
    377 }
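
A sketch of the intended transformation, assuming a little-endian target and
that both bytes are dereferenceable (f_wide is a made-up name):

#include <string.h>

int f_wide(char *p) {
  unsigned short v;
  memcpy(&v, p, sizeof v);   /* the single 16-bit load */
  return v == 0x0201;        /* p[0] == 1 in the low byte, p[1] == 2 in the high byte */
}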
    378 
    379 //===---------------------------------------------------------------------===//
    380 
    381 We should inline lrintf and probably other libc functions.
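
A note on what inlining buys here (a sketch; round_to_long is a made-up name):
on SSE targets lrintf() rounds according to the current rounding mode, exactly
like cvtss2si, so the call can become a single instruction.

#include <math.h>

long round_to_long(float f) {
  return lrintf(f);   /* ideally a single cvtss2si, no libcall */
}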
    382 
    383 //===---------------------------------------------------------------------===//
    384 
    385 This code:
    386 
    387 void test(int X) {
    388   if (X) abort();
    389 }
    390 
    391 is currently compiled to:
    392 
    393 _test:
    394         subl $12, %esp
    395         cmpl $0, 16(%esp)
    396         jne LBB1_1
    397         addl $12, %esp
    398         ret
    399 LBB1_1:
    400         call L_abort$stub
    401 
    402 It would be better to produce:
    403 
    404 _test:
    405         subl $12, %esp
    406         cmpl $0, 16(%esp)
    407         jne L_abort$stub
    408         addl $12, %esp
    409         ret
    410 
    411 This can be applied to any no-return function call that takes no arguments etc.
    412 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
    413 something like this:
    414 
    415 _test:
    416         cmpl $0, 4(%esp)
    417         jne LBB1_1
    418         ret
    419 LBB1_1:
    420         subl $12, %esp
    421         call L_abort$stub
    422 
    423 Both are useful in different situations.  Finally, it could be shrink-wrapped
    424 and tail called, like this:
    425 
    426 _test:
    427         cmpl $0, 4(%esp)
    428         jne LBB1_1
    429         ret
    430 LBB1_1:
    431         pop %eax   # realign stack.
    432         call L_abort$stub
    433 
    434 Though this probably isn't worth it.
    435 
    436 //===---------------------------------------------------------------------===//
    437 
    438 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
    439 a neg instead of a sub instruction.  Consider:
    440 
    441 int test(char X) { return 7-X; }
    442 
    443 we currently produce:
    444 _test:
    445         movl $7, %eax
    446         movsbl 4(%esp), %ecx
    447         subl %ecx, %eax
    448         ret
    449 
    450 We would use one fewer register if codegen'd as:
    451 
    452         movsbl 4(%esp), %eax
    453 	neg %eax
    454         add $7, %eax
    455         ret
    456 
    457 Note that this isn't beneficial if the load can be folded into the sub.  In
    458 this case, we want a sub:
    459 
    460 int test(int X) { return 7-X; }
    461 _test:
    462         movl $7, %eax
    463         subl 4(%esp), %eax
    464         ret
    465 
    466 //===---------------------------------------------------------------------===//
    467 
    468 Leaf functions that require one 4-byte spill slot have a prolog like this:
    469 
    470 _foo:
    471         pushl   %esi
    472         subl    $4, %esp
    473 ...
    474 and an epilog like this:
    475         addl    $4, %esp
    476         popl    %esi
    477         ret
    478 
    479 It would be smaller, and potentially faster, to push eax on entry and to
    480 pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
    481 into any return registers :)
    482 
    483 //===---------------------------------------------------------------------===//
    484 
    485 The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
    486 branches.  We generate really poor code for:
    487 
    488 double testf(double a) {
    489        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    490 }
    491 
    492 For example, the entry BB is:
    493 
    494 _testf:
    495         subl    $20, %esp
    496         pxor    %xmm0, %xmm0
    497         movsd   24(%esp), %xmm1
    498         ucomisd %xmm0, %xmm1
    499         setnp   %al
    500         sete    %cl
    501         testb   %cl, %al
    502         jne     LBB1_5  # UnifiedReturnBlock
    503 LBB1_1: # cond_true
    504 
    505 
    506 it would be better to replace the last four instructions with:
    507 
    508 	jp LBB1_1
    509 	je LBB1_5
    510 LBB1_1:
    511 
    512 We also codegen the inner ?: into a diamond:
    513 
    514        cvtss2sd        LCPI1_0(%rip), %xmm2
    515         cvtss2sd        LCPI1_1(%rip), %xmm3
    516         ucomisd %xmm1, %xmm0
    517         ja      LBB1_3  # cond_true
    518 LBB1_2: # cond_true
    519         movapd  %xmm3, %xmm2
    520 LBB1_3: # cond_true
    521         movapd  %xmm2, %xmm0
    522         ret
    523 
    524 We should sink the load into xmm3 into the LBB1_2 block.  This should
    525 be pretty easy, and will nuke all the copies.
    526 
    527 //===---------------------------------------------------------------------===//
    528 
    529 This:
    530         #include <algorithm>
    531         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    532         { return std::make_pair(a + b, a + b < a); }
    533         bool no_overflow(unsigned a, unsigned b)
    534         { return !full_add(a, b).second; }
    535 
    536 Should compile to:
    537 	addl	%esi, %edi
    538 	setae	%al
    539 	movzbl	%al, %eax
    540 	ret
    541 
    542 on x86-64, instead of the rather stupid-looking:
    543 	addl	%esi, %edi
    544 	setb	%al
    545 	xorb	$1, %al
    546 	movzbl	%al, %eax
    547 	ret
    548 
    549 
    550 //===---------------------------------------------------------------------===//
    551 
    552 The following code:
    553 
    554 bb114.preheader:		; preds = %cond_next94
    555 	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
    556 	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
    557 	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
    558 	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
    559 	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
    560 	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
    561 	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
    562 	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
    563 	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
    564 	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
    565 	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
    566 	br label %bb114
    567 
    568 produces:
    569 
    570 LBB3_5:	# bb114.preheader
    571 	movswl	-68(%ebp), %eax
    572 	movl	$32, %ecx
    573 	movl	%ecx, -80(%ebp)
    574 	subl	%eax, -80(%ebp)
    575 	movswl	-52(%ebp), %eax
    576 	movl	%ecx, -84(%ebp)
    577 	subl	%eax, -84(%ebp)
    578 	movswl	-70(%ebp), %eax
    579 	movl	%ecx, -88(%ebp)
    580 	subl	%eax, -88(%ebp)
    581 	movswl	-50(%ebp), %eax
    582 	subl	%eax, %ecx
    583 	movl	%ecx, -76(%ebp)
    584 	movswl	-42(%ebp), %eax
    585 	movl	%eax, -92(%ebp)
    586 	movswl	-66(%ebp), %eax
    587 	movl	%eax, -96(%ebp)
    588 	movw	$0, -98(%ebp)
    589 
    590 This appears to be bad because the RA is not folding the store to the stack 
    591 slot into the movl.  The above instructions could be:
    592 	movl    $32, -80(%ebp)
    593 ...
    594 	movl    $32, -84(%ebp)
    595 ...
    596 This seems like a cross between remat and spill folding.
    597 
    598 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
    599 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
    600 vice-versa).
    601 
    602 //===---------------------------------------------------------------------===//
    603 
    604 This code:
    605 
    606 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
    607 	br i1 %tmp659, label %cond_true662, label %cond_next715
    608 
    609 produces this:
    610 
    611 	testw	%cx, %cx
    612 	movswl	%cx, %esi
    613 	jns	LBB4_109	# cond_next715
    614 
    615 Shark tells us that using %cx in the testw instruction is sub-optimal. It
    616 suggests using the 32-bit register (which is what ICC uses).
    617 
    618 //===---------------------------------------------------------------------===//
    619 
    620 We compile this:
    621 
    622 void compare (long long foo) {
    623   if (foo < 4294967297LL)
    624     abort();
    625 }
    626 
    627 to:
    628 
    629 compare:
    630         subl    $4, %esp
    631         cmpl    $0, 8(%esp)
    632         setne   %al
    633         movzbw  %al, %ax
    634         cmpl    $1, 12(%esp)
    635         setg    %cl
    636         movzbw  %cl, %cx
    637         cmove   %ax, %cx
    638         testb   $1, %cl
    639         jne     .LBB1_2 # UnifiedReturnBlock
    640 .LBB1_1:        # ifthen
    641         call    abort
    642 .LBB1_2:        # UnifiedReturnBlock
    643         addl    $4, %esp
    644         ret
    645 
    646 (also really horrible code on ppc).  This is due to the expand code for 64-bit
    647 compares.  GCC produces multiple branches, which is much nicer:
    648 
    649 compare:
    650         subl    $12, %esp
    651         movl    20(%esp), %edx
    652         movl    16(%esp), %eax
    653         decl    %edx
    654         jle     .L7
    655 .L5:
    656         addl    $12, %esp
    657         ret
    658         .p2align 4,,7
    659 .L7:
    660         jl      .L4
    661         cmpl    $0, %eax
    662         .p2align 4,,8
    663         ja      .L5
    664 .L4:
    665         .p2align 4,,9
    666         call    abort
    667 
    668 //===---------------------------------------------------------------------===//
    669 
    670 Tail call optimization improvements: Tail call optimization currently
    671 pushes all arguments on the top of the stack (their normal place for
    672 non-tail call optimized calls) that source from the caller's arguments
    673 or that source from a virtual register (also possibly sourcing from the
    674 caller's arguments).
    675 This is done to prevent overwriting of parameters (see example
    676 below) that might be used later.
    677 
    678 example:  
    679 
    680 int callee(int32, int64); 
    681 int caller(int32 arg1, int32 arg2) { 
    682   int64 local = arg2 * 2; 
    683   return callee(arg2, (int64)local); 
    684 }
    685 
    686 [arg1]          [!arg2 no longer valid since we moved local onto it]
    687 [arg2]      ->  [(int64)
    688 [RETADDR]        local  ]
    689 
    690 Moving arg1 onto the stack slot of the callee function would overwrite
    691 arg2 of the caller.
    692 
    693 Possible optimizations:
    694 
    695 
    696  - Analyse the actual parameters of the callee to see which would
    697    overwrite a caller parameter which is used by the callee, and push
    698    only those onto the top of the stack.
    699 
    700    int callee (int32 arg1, int32 arg2);
    701    int caller (int32 arg1, int32 arg2) {
    702        return callee(arg1,arg2);
    703    }
    704 
    705    Here we don't need to write any variables to the top of the stack
    706    since they don't overwrite each other.
    707 
    708    int callee (int32 arg1, int32 arg2);
    709    int caller (int32 arg1, int32 arg2) {
    710        return callee(arg2,arg1);
    711    }
    712 
    713    Here we need to push the arguments because they overwrite each
    714    other.
    715 
    716 //===---------------------------------------------------------------------===//
    717 
    718 main ()
    719 {
    720   int i = 0;
    721   unsigned long int z = 0;
    722 
    723   do {
    724     z -= 0x00004000;
    725     i++;
    726     if (i > 0x00040000)
    727       abort ();
    728   } while (z > 0);
    729   exit (0);
    730 }
    731 
    732 gcc compiles this to:
    733 
    734 _main:
    735 	subl	$28, %esp
    736 	xorl	%eax, %eax
    737 	jmp	L2
    738 L3:
    739 	cmpl	$262144, %eax
    740 	je	L10
    741 L2:
    742 	addl	$1, %eax
    743 	cmpl	$262145, %eax
    744 	jne	L3
    745 	call	L_abort$stub
    746 L10:
    747 	movl	$0, (%esp)
    748 	call	L_exit$stub
    749 
    750 llvm:
    751 
    752 _main:
    753 	subl	$12, %esp
    754 	movl	$1, %eax
    755 	movl	$16384, %ecx
    756 LBB1_1:	# bb
    757 	cmpl	$262145, %eax
    758 	jge	LBB1_4	# cond_true
    759 LBB1_2:	# cond_next
    760 	incl	%eax
    761 	addl	$4294950912, %ecx
    762 	cmpl	$16384, %ecx
    763 	jne	LBB1_1	# bb
    764 LBB1_3:	# bb11
    765 	xorl	%eax, %eax
    766 	addl	$12, %esp
    767 	ret
    768 LBB1_4:	# cond_true
    769 	call	L_abort$stub
    770 
    771 1. LSR should rewrite the first cmp with induction variable %ecx.
    772 2. DAG combiner should fold
    773         leal    1(%eax), %edx
    774         cmpl    $262145, %edx
    775    =>
    776         cmpl    $262144, %eax
    777 
    778 //===---------------------------------------------------------------------===//
    779 
    780 define i64 @test(double %X) {
    781 	%Y = fptosi double %X to i64
    782 	ret i64 %Y
    783 }
    784 
    785 compiles to:
    786 
    787 _test:
    788 	subl	$20, %esp
    789 	movsd	24(%esp), %xmm0
    790 	movsd	%xmm0, 8(%esp)
    791 	fldl	8(%esp)
    792 	fisttpll	(%esp)
    793 	movl	4(%esp), %edx
    794 	movl	(%esp), %eax
    795 	addl	$20, %esp
    796 	#FP_REG_KILL
    797 	ret
    798 
    799 This should just fldl directly from the input stack slot.
    800 
    801 //===---------------------------------------------------------------------===//
    802 
    803 This code:
    804 int foo (int x) { return (x & 65535) | 255; }
    805 
    806 Should compile into:
    807 
    808 _foo:
    809         movzwl  4(%esp), %eax
    810         orl     $255, %eax
    811         ret
    812 
    813 instead of:
    814 _foo:
    815 	movl	$65280, %eax
    816 	andl	4(%esp), %eax
    817 	orl	$255, %eax
    818 	ret
    819 
    820 //===---------------------------------------------------------------------===//
    821 
    822 We're codegen'ing multiply of long longs inefficiently:
    823 
    824 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    825   return arg1 *  arg2;
    826 }
    827 
    828 We compile to (with -fomit-frame-pointer):
    829 
    830 _LLM:
    831 	pushl	%esi
    832 	movl	8(%esp), %ecx
    833 	movl	16(%esp), %esi
    834 	movl	%esi, %eax
    835 	mull	%ecx
    836 	imull	12(%esp), %esi
    837 	addl	%edx, %esi
    838 	imull	20(%esp), %ecx
    839 	movl	%esi, %edx
    840 	addl	%ecx, %edx
    841 	popl	%esi
    842 	ret
    843 
    844 This looks like a scheduling deficiency and lack of remat of the load from
    845 the argument area.  ICC apparently produces:
    846 
    847         movl      8(%esp), %ecx
    848         imull     12(%esp), %ecx
    849         movl      16(%esp), %eax
    850         imull     4(%esp), %eax 
    851         addl      %eax, %ecx  
    852         movl      4(%esp), %eax
    853         mull      12(%esp) 
    854         addl      %ecx, %edx
    855         ret
    856 
    857 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
    858 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
    859 
    860 //===---------------------------------------------------------------------===//
    861 
    862 We can fold a store into "zeroing a reg".  Instead of:
    863 
    864 xorl    %eax, %eax
    865 movl    %eax, 124(%esp)
    866 
    867 we should get:
    868 
    869 movl    $0, 124(%esp)
    870 
    871 if the flags of the xor are dead.
    872 
    873 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
    874 be folded into: shl [mem], 1
    875 
    876 //===---------------------------------------------------------------------===//
    877 
    878 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
    879 or and instruction, for example:
    880 
    881 	xorpd	LCPI1_0, %xmm2
    882 
    883 However, if xmm2 gets spilled, we end up with really ugly code like this:
    884 
    885 	movsd	(%esp), %xmm0
    886 	xorpd	LCPI1_0, %xmm0
    887 	movsd	%xmm0, (%esp)
    888 
    889 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
    890 the neg/abs instruction, turning it into an *integer* operation, like this:
    891 
    892 	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
    893 
    894 you could also use xorb, but xorl is less likely to lead to a partial register
    895 stall.  Here is a contrived testcase:
    896 
    897 double a, b, c;
    898 void test(double *P) {
    899   double X = *P;
    900   a = X;
    901   bar();
    902   X = -X;
    903   b = X;
    904   bar();
    905   c = X;
    906 }
    907 
    908 //===---------------------------------------------------------------------===//
    909 
    910 The code generated on x86 for checking for signed overflow on a multiply in the
    911 obvious way is much longer than it needs to be.
    912 
    913 int x(int a, int b) {
    914   long long prod = (long long)a*b;
    915   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
    916 }
    917 
    918 See PR2053 for more details.
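
A shorter formulation is possible; as a sketch (relying on the GCC/Clang
overflow builtin rather than anything we emit today), the check boils down to
a single 32-bit imull plus a test of the overflow flag:

int x_overflow(int a, int b) {
  int prod;
  return __builtin_smul_overflow(a, b, &prod);   /* imull + seto */
}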
    919 
    920 //===---------------------------------------------------------------------===//
    921 
    922 We should investigate using cdq/cltd (effect: edx = sar eax, 31)
    923 more aggressively; it should cost the same as a move+shift on any modern
    924 processor, but it's a lot shorter. Downside is that it puts more
    925 pressure on register allocation because it has fixed operands.
    926 
    927 Example:
    928 int abs(int x) {return x < 0 ? -x : x;}
    929 
    930 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
    931 abs:
    932         movl    4(%esp), %eax
    933         cltd
    934         xorl    %edx, %eax
    935         subl    %edx, %eax
    936         ret
    937 
    938 //===---------------------------------------------------------------------===//
    939 
    940 Take the following code (from 
    941 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
    942 
    943 extern unsigned char first_one[65536];
    944 int FirstOnet(unsigned long long arg1)
    945 {
    946   if (arg1 >> 48)
    947     return (first_one[arg1 >> 48]);
    948   return 0;
    949 }
    950 
    951 
    952 The following code is currently generated:
    953 FirstOnet:
    954         movl    8(%esp), %eax
    955         cmpl    $65536, %eax
    956         movl    4(%esp), %ecx
    957         jb      .LBB1_2 # UnifiedReturnBlock
    958 .LBB1_1:        # ifthen
    959         shrl    $16, %eax
    960         movzbl  first_one(%eax), %eax
    961         ret
    962 .LBB1_2:        # UnifiedReturnBlock
    963         xorl    %eax, %eax
    964         ret
    965 
    966 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
    967 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
    968 
    969 //===---------------------------------------------------------------------===//
    970 
    971 We compile this function:
    972 
    973 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
    974 entry:
    975 	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
    976 	br i1 %tmp2, label %bb7, label %bb
    977 
    978 bb:		; preds = %entry
    979 	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
    980 	ret i32 %tmp6
    981 
    982 bb7:		; preds = %entry
    983 	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
    984 	ret i32 %tmp10
    985 }
    986 
    987 to:
    988 
    989 foo:                                    # @foo
    990 # %bb.0:                                # %entry
    991 	movl	4(%esp), %ecx
    992 	cmpb	$0, 16(%esp)
    993 	je	.LBB0_2
    994 # %bb.1:                                # %bb
    995 	movl	8(%esp), %eax
    996 	addl	%ecx, %eax
    997 	ret
    998 .LBB0_2:                                # %bb7
    999 	movl	12(%esp), %edx
   1000 	movl	%ecx, %eax
   1001 	subl	%edx, %eax
   1002 	ret
   1003 
   1004 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
   1005 couple more movls by putting 4(%esp) into %eax instead of %ecx.
   1006 
   1007 //===---------------------------------------------------------------------===//
   1008 
   1009 See rdar://4653682.
   1010 
   1011 From flops:
   1012 
   1013 LBB1_15:        # bb310
   1014         cvtss2sd        LCPI1_0, %xmm1
   1015         addsd   %xmm1, %xmm0
   1016         movsd   176(%esp), %xmm2
   1017         mulsd   %xmm0, %xmm2
   1018         movapd  %xmm2, %xmm3
   1019         mulsd   %xmm3, %xmm3
   1020         movapd  %xmm3, %xmm4
   1021         mulsd   LCPI1_23, %xmm4
   1022         addsd   LCPI1_24, %xmm4
   1023         mulsd   %xmm3, %xmm4
   1024         addsd   LCPI1_25, %xmm4
   1025         mulsd   %xmm3, %xmm4
   1026         addsd   LCPI1_26, %xmm4
   1027         mulsd   %xmm3, %xmm4
   1028         addsd   LCPI1_27, %xmm4
   1029         mulsd   %xmm3, %xmm4
   1030         addsd   LCPI1_28, %xmm4
   1031         mulsd   %xmm3, %xmm4
   1032         addsd   %xmm1, %xmm4
   1033         mulsd   %xmm2, %xmm4
   1034         movsd   152(%esp), %xmm1
   1035         addsd   %xmm4, %xmm1
   1036         movsd   %xmm1, 152(%esp)
   1037         incl    %eax
   1038         cmpl    %eax, %esi
   1039         jge     LBB1_15 # bb310
   1040 LBB1_16:        # bb358.loopexit
   1041         movsd   152(%esp), %xmm0
   1042         addsd   %xmm0, %xmm0
   1043         addsd   LCPI1_22, %xmm0
   1044         movsd   %xmm0, 152(%esp)
   1045 
   1046 Rather than spilling the result of the last addsd in the loop, we should have
   1047 inserted a copy to split the interval (one for the duration of the loop, one
   1048 extending to the fall through). The register pressure in the loop isn't high
   1049 enough to warrant the spill.
   1050 
   1051 Also check why xmm7 is not used at all in the function.
   1052 
   1053 //===---------------------------------------------------------------------===//
   1054 
   1055 Take the following:
   1056 
   1057 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
   1058 target triple = "i386-apple-darwin8"
   1059 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
   1060 define fastcc void @abort_gzip() noreturn nounwind  {
   1061 entry:
   1062 	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
   1063 	br i1 %tmp.b.i, label %bb.i, label %bb4.i
   1064 bb.i:		; preds = %entry
   1065 	tail call void @exit( i32 1 ) noreturn nounwind 
   1066 	unreachable
   1067 bb4.i:		; preds = %entry
   1068 	store i1 true, i1* @in_exit.4870.b
   1069 	tail call void @exit( i32 1 ) noreturn nounwind 
   1070 	unreachable
   1071 }
   1072 declare void @exit(i32) noreturn nounwind 
   1073 
   1074 This compiles into:
   1075 _abort_gzip:                            ## @abort_gzip
   1076 ## %bb.0:                               ## %entry
   1077 	subl	$12, %esp
   1078 	movb	_in_exit.4870.b, %al
   1079 	cmpb	$1, %al
   1080 	jne	LBB0_2
   1081 
   1082 We somehow miss folding the movb into the cmpb.
   1083 
   1084 //===---------------------------------------------------------------------===//
   1085 
   1086 We compile:
   1087 
   1088 int test(int x, int y) {
   1089   return x-y-1;
   1090 }
   1091 
   1092 into (-m64):
   1093 
   1094 _test:
   1095 	decl	%edi
   1096 	movl	%edi, %eax
   1097 	subl	%esi, %eax
   1098 	ret
   1099 
   1100 it would be better to codegen as: x+~y  (notl+addl)
   1101 
   1102 //===---------------------------------------------------------------------===//
   1103 
   1104 This code:
   1105 
   1106 int foo(const char *str,...)
   1107 {
   1108  __builtin_va_list a; int x;
   1109  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
   1110  return x;
   1111 }
   1112 
   1113 gets compiled into this on x86-64:
   1114 	subq    $200, %rsp
   1115         movaps  %xmm7, 160(%rsp)
   1116         movaps  %xmm6, 144(%rsp)
   1117         movaps  %xmm5, 128(%rsp)
   1118         movaps  %xmm4, 112(%rsp)
   1119         movaps  %xmm3, 96(%rsp)
   1120         movaps  %xmm2, 80(%rsp)
   1121         movaps  %xmm1, 64(%rsp)
   1122         movaps  %xmm0, 48(%rsp)
   1123         movq    %r9, 40(%rsp)
   1124         movq    %r8, 32(%rsp)
   1125         movq    %rcx, 24(%rsp)
   1126         movq    %rdx, 16(%rsp)
   1127         movq    %rsi, 8(%rsp)
   1128         leaq    (%rsp), %rax
   1129         movq    %rax, 192(%rsp)
   1130         leaq    208(%rsp), %rax
   1131         movq    %rax, 184(%rsp)
   1132         movl    $48, 180(%rsp)
   1133         movl    $8, 176(%rsp)
   1134         movl    176(%rsp), %eax
   1135         cmpl    $47, %eax
   1136         jbe     .LBB1_3 # bb
   1137 .LBB1_1:        # bb3
   1138         movq    184(%rsp), %rcx
   1139         leaq    8(%rcx), %rax
   1140         movq    %rax, 184(%rsp)
   1141 .LBB1_2:        # bb4
   1142         movl    (%rcx), %eax
   1143         addq    $200, %rsp
   1144         ret
   1145 .LBB1_3:        # bb
   1146         movl    %eax, %ecx
   1147         addl    $8, %eax
   1148         addq    192(%rsp), %rcx
   1149         movl    %eax, 176(%rsp)
   1150         jmp     .LBB1_2 # bb4
   1151 
   1152 gcc 4.3 generates:
   1153 	subq    $96, %rsp
   1154 .LCFI0:
   1155         leaq    104(%rsp), %rax
   1156         movq    %rsi, -80(%rsp)
   1157         movl    $8, -120(%rsp)
   1158         movq    %rax, -112(%rsp)
   1159         leaq    -88(%rsp), %rax
   1160         movq    %rax, -104(%rsp)
   1161         movl    $8, %eax
   1162         cmpl    $48, %eax
   1163         jb      .L6
   1164         movq    -112(%rsp), %rdx
   1165         movl    (%rdx), %eax
   1166         addq    $96, %rsp
   1167         ret
   1168         .p2align 4,,10
   1169         .p2align 3
   1170 .L6:
   1171         mov     %eax, %edx
   1172         addq    -104(%rsp), %rdx
   1173         addl    $8, %eax
   1174         movl    %eax, -120(%rsp)
   1175         movl    (%rdx), %eax
   1176         addq    $96, %rsp
   1177         ret
   1178 
   1179 and it gets compiled into this on x86:
   1180 	pushl   %ebp
   1181         movl    %esp, %ebp
   1182         subl    $4, %esp
   1183         leal    12(%ebp), %eax
   1184         movl    %eax, -4(%ebp)
   1185         leal    16(%ebp), %eax
   1186         movl    %eax, -4(%ebp)
   1187         movl    12(%ebp), %eax
   1188         addl    $4, %esp
   1189         popl    %ebp
   1190         ret
   1191 
   1192 gcc 4.3 generates:
   1193 	pushl   %ebp
   1194         movl    %esp, %ebp
   1195         movl    12(%ebp), %eax
   1196         popl    %ebp
   1197         ret
   1198 
   1199 //===---------------------------------------------------------------------===//
   1200 
   1201 Teach tblgen not to check bitconvert source type in some cases. This allows us
   1202 to consolidate the following patterns in X86InstrMMX.td:
   1203 
   1204 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1205                                                   (iPTR 0))))),
   1206           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
   1207 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1208                                                   (iPTR 0))))),
   1209           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
   1210 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1211                                                   (iPTR 0))))),
   1212           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
   1213 
   1214 There are other cases in various td files.
   1215 
   1216 //===---------------------------------------------------------------------===//
   1217 
   1218 Take something like the following on x86-32:
   1219 unsigned a(unsigned long long x, unsigned y) {return x % y;}
   1220 
   1221 We currently generate a libcall, but we really shouldn't: the expansion is
   1222 shorter and likely faster than the libcall.  The expected code is something
   1223 like the following:
   1224 
   1225 	movl	12(%ebp), %eax
   1226 	movl	16(%ebp), %ecx
   1227 	xorl	%edx, %edx
   1228 	divl	%ecx
   1229 	movl	8(%ebp), %eax
   1230 	divl	%ecx
   1231 	movl	%edx, %eax
   1232 	ret
   1233 
   1234 A similar code sequence works for division.
   1235 
   1236 //===---------------------------------------------------------------------===//
   1237 
   1238 We currently compile this:
   1239 
   1240 define i32 @func1(i32 %v1, i32 %v2) nounwind {
   1241 entry:
   1242   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   1243   %sum = extractvalue {i32, i1} %t, 0
   1244   %obit = extractvalue {i32, i1} %t, 1
   1245   br i1 %obit, label %overflow, label %normal
   1246 normal:
   1247   ret i32 %sum
   1248 overflow:
   1249   call void @llvm.trap()
   1250   unreachable
   1251 }
   1252 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
   1253 declare void @llvm.trap()
   1254 
   1255 to:
   1256 
   1257 _func1:
   1258 	movl	4(%esp), %eax
   1259 	addl	8(%esp), %eax
   1260 	jo	LBB1_2	## overflow
   1261 LBB1_1:	## normal
   1262 	ret
   1263 LBB1_2:	## overflow
   1264 	ud2
   1265 
   1266 it would be nice to produce "into" someday.
   1267 
   1268 //===---------------------------------------------------------------------===//
   1269 
   1270 Test instructions can be eliminated by using EFLAGS values from arithmetic
   1271 instructions. This is currently not done for mul, and, or, xor, neg, shl,
   1272 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
   1273 for read-modify-write instructions. It is also currently not done if the
   1274 OF or CF flags are needed.
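
For illustration, a small example of the kind of redundant test in question
(a sketch; and_sets_flags is a made-up name): the 'and' that computes the
result already sets ZF/SF, so the extra testl emitted for the comparison
against zero could be dropped as long as OF/CF are not needed.

int and_sets_flags(int a, int b) {
  int m = a & b;     /* andl already sets ZF/SF */
  return m != 0;     /* ideally reuses EFLAGS instead of a separate testl */
}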
   1275 
   1276 The shift operators have the complication that when the shift count is
   1277 zero, EFLAGS is not set, so they can only subsume a test instruction if
   1278 the shift count is known to be non-zero. Also, using the EFLAGS value
   1279 from a shift is apparently very slow on some x86 implementations.
   1280 
   1281 In read-modify-write instructions, the root node in the isel match is
   1282 the store, and isel has no way for the use of the EFLAGS result of the
   1283 arithmetic to be remapped to the new node.
   1284 
   1285 Add and subtract instructions set OF on signed overflow and CF on unsigned
   1286 overflow, while test instructions always clear OF and CF. In order to
   1287 replace a test with an add or subtract in a situation where OF or CF is
   1288 needed, codegen must be able to prove that the operation cannot see
   1289 signed or unsigned overflow, respectively.
   1290 
   1291 //===---------------------------------------------------------------------===//
   1292 
   1293 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
   1294 define <16 x float> @foo(<16 x float> %A) nounwind {
   1295 	%tmp = alloca <16 x float>, align 16
   1296 	%tmp2 = alloca <16 x float>, align 16
   1297 	store <16 x float> %A, <16 x float>* %tmp
   1298 	%s = bitcast <16 x float>* %tmp to i8*
   1299 	%s2 = bitcast <16 x float>* %tmp2 to i8*
   1300 	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
   1301 	%R = load <16 x float>* %tmp2
   1302 	ret <16 x float> %R
   1303 }
   1304 
   1305 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
   1306 
   1307 which compiles to:
   1308 
   1309 _foo:
   1310 	subl	$140, %esp
   1311 	movaps	%xmm3, 112(%esp)
   1312 	movaps	%xmm2, 96(%esp)
   1313 	movaps	%xmm1, 80(%esp)
   1314 	movaps	%xmm0, 64(%esp)
   1315 	movl	60(%esp), %eax
   1316 	movl	%eax, 124(%esp)
   1317 	movl	56(%esp), %eax
   1318 	movl	%eax, 120(%esp)
   1319 	movl	52(%esp), %eax
   1320         <many many more 32-bit copies>
   1321       	movaps	(%esp), %xmm0
   1322 	movaps	16(%esp), %xmm1
   1323 	movaps	32(%esp), %xmm2
   1324 	movaps	48(%esp), %xmm3
   1325 	addl	$140, %esp
   1326 	ret
   1327 
   1328 On Nehalem, it may even be cheaper to just use movups when unaligned than to
   1329 fall back to lower-granularity chunks.
   1330 
   1331 //===---------------------------------------------------------------------===//
   1332 
   1333 Implement processor-specific optimizations for parity with GCC on these
   1334 processors.  GCC does two optimizations:
   1335 
   1336 1. ix86_pad_returns inserts a noop before a ret instruction if it is immediately
   1337    preceded by a conditional branch or is the target of a jump.
   1338 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   1339    code contains more than 3 branches.
   1340    
   1341 The first one is done for all AMDs, Core2, and "Generic".
   1342 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
   1343   Core 2, and "Generic".
   1344 
   1345 //===---------------------------------------------------------------------===//
   1346 Testcase:
   1347 int x(int a) { return (a&0xf0)>>4; }
   1348 
   1349 Current output:
   1350 	movl	4(%esp), %eax
   1351 	shrl	$4, %eax
   1352 	andl	$15, %eax
   1353 	ret
   1354 
   1355 Ideal output:
   1356 	movzbl	4(%esp), %eax
   1357 	shrl	$4, %eax
   1358 	ret
   1359 
   1360 //===---------------------------------------------------------------------===//
   1361 
   1362 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
   1363 properly.
   1364 
   1365 When the return value is not used (i.e. we only care about the value in memory),
   1366 x86 does not have to produce the old value (e.g. via xadd). Instead, it can use
   1367 add, sub, inc, or dec instructions with the "lock" prefix.
   1368 
   1369 This is currently implemented using a bit of an instruction selection trick. The
   1370 issue is that the target-independent pattern produces one output and a chain, and
   1371 we want to map it into one that just outputs a chain. The current trick is to select
   1372 it into a MERGE_VALUES with the first definition being an implicit_def. The
   1373 proper solution is to add new ISD opcodes for the no-output variant. DAG
   1374 combiner can then transform the node before it gets to target node selection.
   1375 
   1376 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
   1377 fact these instructions are identical to the non-lock versions. We need a way to
   1378 add target specific information to target nodes and have this information
   1379 carried over to machine instructions. Asm printer (or JIT) can use this
   1380 information to add the "lock" prefix.
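
For reference, a minimal example of the unused-result case discussed above
(a sketch using the GCC/Clang __sync builtin; atomic_inc is a made-up name):

void atomic_inc(int *p) {
  /* The fetched value is ignored, so this could lower to a single
     "lock addl $1, (%rdi)" instead of a lock xadd that materializes
     the old value in a register. */
  __sync_add_and_fetch(p, 1);
}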
   1381 
   1382 //===---------------------------------------------------------------------===//
   1383 
   1384 struct B {
   1385   unsigned char y0 : 1;
   1386 };
   1387 
   1388 int bar(struct B* a) { return a->y0; }
   1389 
   1390 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
   1391   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
   1392   %2 = load i8* %1, align 1
   1393   %3 = and i8 %2, 1
   1394   %4 = zext i8 %3 to i32
   1395   ret i32 %4
   1396 }
   1397 
   1398 bar:                                    # @bar
   1399 # %bb.0:
   1400         movb    (%rdi), %al
   1401         andb    $1, %al
   1402         movzbl  %al, %eax
   1403         ret
   1404 
   1405 Missed optimization: should be movl+andl.
   1406 
   1407 //===---------------------------------------------------------------------===//
   1408 
   1409 The x86_64 ABI says:
   1410 
   1411 Booleans, when stored in a memory object, are stored as single byte objects the
   1412 value of which is always 0 (false) or 1 (true).
   1413 
   1414 We are not using this fact:
   1415 
   1416 int bar(_Bool *a) { return *a; }
   1417 
   1418 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
   1419   %1 = load i8* %a, align 1, !tbaa !0
   1420   %tmp = and i8 %1, 1
   1421   %2 = zext i8 %tmp to i32
   1422   ret i32 %2
   1423 }
   1424 
   1425 bar:
   1426         movb    (%rdi), %al
   1427         andb    $1, %al
   1428         movzbl  %al, %eax
   1429         ret
   1430 
   1431 GCC produces
   1432 
   1433 bar:
   1434         movzbl  (%rdi), %eax
   1435         ret
   1436 
   1437 //===---------------------------------------------------------------------===//
   1438 
   1439 Take the following C code:
   1440 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
   1441 
   1442 We generate the following IR with clang:
   1443 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1444 entry:
   1445   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
   1446   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
   1447   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
   1448   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1449   ret i32 %conv5
   1450 }
   1451 
   1452 And the following x86 code:
   1453 	xorl	%esi, %edi
   1454 	testb	$-1, %dil
   1455 	sete	%al
   1456 	movzbl	%al, %eax
   1457 	ret
   1458 
   1459 A cmpb instead of the xorl+testb would be one instruction shorter.
   1460 
   1461 //===---------------------------------------------------------------------===//
   1462 
   1463 Given the following C code:
   1464 int f(int a, int b) { return (signed char)a == (signed char)b; }
   1465 
   1466 We generate the following IR with clang:
   1467 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1468 entry:
   1469   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
   1470   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
   1471   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
   1472   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
   1473   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
   1474   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1475   ret i32 %conv5
   1476 }
   1477 
   1478 And the following x86 code:
   1479 	movsbl	%sil, %eax
   1480 	movsbl	%dil, %ecx
   1481 	cmpl	%eax, %ecx
   1482 	sete	%al
   1483 	movzbl	%al, %eax
   1484 	ret
   1485 
   1486 
   1487 It should be possible to eliminate the sign extensions.
   1488 
   1489 //===---------------------------------------------------------------------===//
   1490 
   1491 LLVM misses a load+store narrowing opportunity in this code:
   1492 
   1493 %struct.bf = type { i64, i16, i16, i32 }
   1494 
   1495 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
   1496 
   1497 define void @t1() nounwind ssp {
   1498 entry:
   1499   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1500   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
   1501   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
   1502   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
   1503   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
   1504   store i32 %4, i32* %2, align 1
   1505   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1506   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
   1507   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
   1508   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
   1509   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
   1510   store i32 %9, i32* %7, align 1
   1511   ret void
   1512 }
   1513 
   1514 LLVM currently emits this:
   1515 
   1516   movq  bfi(%rip), %rax
   1517   andl  $-65537, 8(%rax)
   1518   movq  bfi(%rip), %rax
   1519   andl  $-131073, 8(%rax)
   1520   ret
   1521 
   1522 It could narrow the loads and stores to emit this:
   1523 
   1524   movq  bfi(%rip), %rax
   1525   andb  $-2, 10(%rax)
   1526   movq  bfi(%rip), %rax
   1527   andb  $-3, 10(%rax)
   1528   ret
   1529 
   1530 The trouble is that there is a TokenFactor between the store and the
   1531 load, making it non-trivial to determine if there's anything between
   1532 the load and the store which would prohibit narrowing.
   1533 
   1534 //===---------------------------------------------------------------------===//
   1535 
   1536 This code:
   1537 void foo(unsigned x) {
   1538   if (x == 0) bar();
   1539   else if (x == 1) qux();
   1540 }
   1541 
   1542 currently compiles into:
   1543 _foo:
   1544 	movl	4(%esp), %eax
   1545 	cmpl	$1, %eax
   1546 	je	LBB0_3
   1547 	testl	%eax, %eax
   1548 	jne	LBB0_4
   1549 
   1550 the testl could be removed:
   1551 _foo:
   1552 	movl	4(%esp), %eax
   1553 	cmpl	$1, %eax
   1554 	je	LBB0_3
   1555 	jb	LBB0_4
   1556 
   1557 0 is the only unsigned number < 1.
   1558 
   1559 //===---------------------------------------------------------------------===//
   1560 
   1561 This code:
   1562 
   1563 %0 = type { i32, i1 }
   1564 
   1565 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
   1566 entry:
   1567   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
   1568   %cmp = extractvalue %0 %uadd, 1
   1569   %inc = zext i1 %cmp to i32
   1570   %add = add i32 %x, %sum
   1571   %z.0 = add i32 %add, %inc
   1572   ret i32 %z.0
   1573 }
   1574 
   1575 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
   1576 
   1577 compiles to:
   1578 
   1579 _add32carry:                            ## @add32carry
   1580 	addl	%esi, %edi
   1581 	sbbl	%ecx, %ecx
   1582 	movl	%edi, %eax
   1583 	subl	%ecx, %eax
   1584 	ret
   1585 
   1586 But it could be:
   1587 
   1588 _add32carry:
   1589 	leal	(%rsi,%rdi), %eax
   1590 	cmpl	%esi, %eax
   1591 	adcl	$0, %eax
   1592 	ret
   1593 
   1594 //===---------------------------------------------------------------------===//
   1595 
   1596 The hot loop of 256.bzip2 contains code that looks a bit like this:
   1597 
   1598 int foo(char *P, char *Q, int x, int y) {
   1599   if (P[0] != Q[0])
   1600      return P[0] < Q[0];
   1601   if (P[1] != Q[1])
   1602      return P[1] < Q[1];
   1603   if (P[2] != Q[2])
   1604      return P[2] < Q[2];
   1605    return P[3] < Q[3];
   1606 }
   1607 
   1608 In the real code, we get a lot more wrong than this.  However, even in this
   1609 code we generate:
   1610 
   1611 _foo:                                   ## @foo
   1612 ## %bb.0:                               ## %entry
   1613 	movb	(%rsi), %al
   1614 	movb	(%rdi), %cl
   1615 	cmpb	%al, %cl
   1616 	je	LBB0_2
   1617 LBB0_1:                                 ## %if.then
   1618 	cmpb	%al, %cl
   1619 	jmp	LBB0_5
   1620 LBB0_2:                                 ## %if.end
   1621 	movb	1(%rsi), %al
   1622 	movb	1(%rdi), %cl
   1623 	cmpb	%al, %cl
   1624 	jne	LBB0_1
   1625 ## %bb.3:                               ## %if.end38
   1626 	movb	2(%rsi), %al
   1627 	movb	2(%rdi), %cl
   1628 	cmpb	%al, %cl
   1629 	jne	LBB0_1
   1630 ## %bb.4:                               ## %if.end60
   1631 	movb	3(%rdi), %al
   1632 	cmpb	3(%rsi), %al
   1633 LBB0_5:                                 ## %if.end60
   1634 	setl	%al
   1635 	movzbl	%al, %eax
   1636 	ret
   1637 
   1638 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
   1639 redundant compare also forces the register values to be live, which prevents
   1640 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
   1641 
   1642 _foo:
   1643 	movzbl	(%rsi), %eax
   1644 	cmpb	%al, (%rdi)
   1645 	jne	L10
   1646 L12:
   1647 	movzbl	1(%rsi), %eax
   1648 	cmpb	%al, 1(%rdi)
   1649 	jne	L10
   1650 	movzbl	2(%rsi), %eax
   1651 	cmpb	%al, 2(%rdi)
   1652 	jne	L10
   1653 	movzbl	3(%rdi), %eax
   1654 	cmpb	3(%rsi), %al
   1655 L10:
   1656 	setl	%al
   1657 	movzbl	%al, %eax
   1658 	ret
   1659 
   1660 which is "perfect".
   1661 
   1662 //===---------------------------------------------------------------------===//
   1663 
   1664 For the branch in the following code:
   1665 int a();
   1666 int b(int x, int y) {
   1667   if (x & (1<<(y&7)))
   1668     return a();
   1669   return y;
   1670 }
   1671 
   1672 We currently generate:
   1673 	movb	%sil, %al
   1674 	andb	$7, %al
   1675 	movzbl	%al, %eax
   1676 	btl	%eax, %edi
   1677 	jae	.LBB0_2
   1678 
   1679 movl+andl would be shorter than the movb+andb+movzbl sequence.
   1680 
   1681 //===---------------------------------------------------------------------===//
   1682 
   1683 For the following:
   1684 struct u1 {
   1685     float x, y;
   1686 };
   1687 float foo(struct u1 u) {
   1688     return u.x + u.y;
   1689 }
   1690 
   1691 We currently generate:
   1692 	movdqa	%xmm0, %xmm1
   1693 	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
   1694 	addss	%xmm1, %xmm0
   1695 	ret
   1696 
   1697 We could save an instruction here by commuting the addss.
   1698 
   1699 //===---------------------------------------------------------------------===//
   1700 
   1701 This (from PR9661):
   1702 
   1703 float clamp_float(float a) {
   1704         if (a > 1.0f)
   1705                 return 1.0f;
   1706         else if (a < 0.0f)
   1707                 return 0.0f;
   1708         else
   1709                 return a;
   1710 }
   1711 
   1712 Could compile to:
   1713 
   1714 clamp_float:                            # @clamp_float
   1715         movss   .LCPI0_0(%rip), %xmm1
   1716         minss   %xmm1, %xmm0
   1717         pxor    %xmm1, %xmm1
   1718         maxss   %xmm1, %xmm0
   1719         ret
   1720 
   1721 with -ffast-math.
   1722 
   1723 //===---------------------------------------------------------------------===//
   1724 
   1725 This function (from PR9803):
   1726 
   1727 int clamp2(int a) {
   1728         if (a > 5)
   1729                 a = 5;
   1730         if (a < 0) 
   1731                 return 0;
   1732         return a;
   1733 }
   1734 
   1735 Compiles to:
   1736 
   1737 _clamp2:                                ## @clamp2
   1738         pushq   %rbp
   1739         movq    %rsp, %rbp
   1740         cmpl    $5, %edi
   1741         movl    $5, %ecx
   1742         cmovlel %edi, %ecx
   1743         testl   %ecx, %ecx
   1744         movl    $0, %eax
   1745         cmovnsl %ecx, %eax
   1746         popq    %rbp
   1747         ret
   1748 
   1749 The move of 0 could be scheduled above the test to make it an xor reg,reg.
   1750 
   1751 //===---------------------------------------------------------------------===//
   1752 
   1753 GCC PR48986.  We currently compile this:
   1754 
   1755 void bar(void);
   1756 void yyy(int* p) {
   1757     if (__sync_fetch_and_add(p, -1) == 1)
   1758       bar();
   1759 }
   1760 
   1761 into:
   1762 	movl	$-1, %eax
   1763 	lock
   1764 	xaddl	%eax, (%rdi)
   1765 	cmpl	$1, %eax
   1766 	je	LBB0_2
   1767 
   1768 Instead we could generate:
   1769 
   1770 	lock
   1771 	decl	(%rdi)
   1772 	je LBB0_2
   1773 
   1774 The trick is to match "fetch_and_add(X, -C) == C".
   1775 
   1776 //===---------------------------------------------------------------------===//
   1777 
   1778 unsigned t(unsigned a, unsigned b) {
   1779   return a <= b ? 5 : -5;
   1780 }
   1781 
   1782 We generate:
   1783 	movl	$5, %ecx
   1784 	cmpl	%esi, %edi
   1785 	movl	$-5, %eax
   1786 	cmovbel	%ecx, %eax
   1787 
   1788 GCC:
   1789 	cmpl	%edi, %esi
   1790 	sbbl	%eax, %eax
   1791 	andl	$-10, %eax
   1792 	addl	$5, %eax
   1793 
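
For reference, GCC's branchless sequence is the classic sbb trick; as a sketch
in C (t_branchless is a made-up name):

unsigned t_branchless(unsigned a, unsigned b) {
  unsigned mask = (a <= b) ? 0u : ~0u;   /* what "sbbl %eax, %eax" materializes */
  return (mask & (unsigned)-10) + 5;     /* all-ones mask -> -5, zero mask -> 5 */
}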
   1794 //===---------------------------------------------------------------------===//
   1795