      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend: SSE-specific stuff.
      3 //===---------------------------------------------------------------------===//
      4 
      5 //===---------------------------------------------------------------------===//
      6 
      7 SSE Variable shift can be custom lowered to something like this, which uses a
      8 small table + unaligned load + shuffle instead of going through memory.
      9 
     10 __m128i_shift_right:
     11 	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     12 	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
     13 
     14 ...
     15 __m128i shift_right(__m128i value, unsigned long offset) {
     16   return _mm_shuffle_epi8(value,
     17                _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
     18 }
     19 
     20 //===---------------------------------------------------------------------===//
     21 
     22 SSE has instructions for operating on complex numbers; we should pattern-match
     23 them.  For example, this should turn into a horizontal add:
     24 
     25 typedef float __attribute__((vector_size(16))) v4f32;
     26 float f32(v4f32 A) {
     27   return A[0]+A[1]+A[2]+A[3];
     28 }
     29 
     30 Instead we get this:
     31 
     32 _f32:                                   ## @f32
     33 	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
     34 	addss	%xmm0, %xmm1
     35 	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
     36 	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
     37 	movaps	%xmm0, %xmm3
     38 	addss	%xmm1, %xmm3
     39 	movdqa	%xmm2, %xmm0
     40 	addss	%xmm3, %xmm0
     41 	ret
     42 
     43 Also, there are cases where some simple local SLP would improve codegen a bit.
     44 Compiling this:
     45 
     46 _Complex float f32(_Complex float A, _Complex float B) {
     47   return A+B;
     48 }
     49 
     50 into:
     51 
     52 _f32:                                   ## @f32
     53 	movdqa	%xmm0, %xmm2
     54 	addss	%xmm1, %xmm2
     55 	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
     56 	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
     57 	addss	%xmm1, %xmm3
     58 	movaps	%xmm2, %xmm0
     59 	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
     60 	ret
     61 
     62 seems silly when it could just be one addps.
     63 
     64 
     65 //===---------------------------------------------------------------------===//
     66 
     67 Expand libm rounding functions inline:  Significant speedups possible.
     68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
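
As a sketch of what the inline expansion buys (illustrative only, and assuming SSE4.1 is available; the patch linked above predates roundsd and synthesizes the rounding out of conversions instead):

#include <smmintrin.h>   /* SSE4.1 */

/* Hypothetical inline expansion of floor(): a single roundsd instead of a libm call. */
static inline double inline_floor(double x) {
  __m128d v = _mm_set_sd(x);
  return _mm_cvtsd_f64(_mm_floor_sd(v, v));
}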
     69 
     70 //===---------------------------------------------------------------------===//
     71 
     72 When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
     73 other fast SSE modes.
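
Concretely, that means emitting the equivalent of this into the startup path (a sketch using the MXCSR intrinsics; enable_fast_sse_modes is a made-up helper name):

#include <xmmintrin.h>
#include <pmmintrin.h>

/* Set FTZ and DAZ in MXCSR: flush denormal results to zero and treat denormal
   inputs as zero. */
static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
}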
     74 
     75 //===---------------------------------------------------------------------===//
     76 
     77 Think about doing i64 math in SSE regs on x86-32.
     78 
     79 //===---------------------------------------------------------------------===//
     80 
     81 This testcase should have no SSE instructions in it, and only one load from
     82 a constant pool:
     83 
     84 double %test3(bool %B) {
     85         %C = select bool %B, double 123.412, double 523.01123123
     86         ret double %C
     87 }
     88 
     89 Currently, the select is being lowered, which prevents the dag combiner from
     90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
     91 
     92 The pattern isel got this one right.
     93 
     94 //===---------------------------------------------------------------------===//
     95 
     96 Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
     97 feasible.
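
For a small known size the lowering amounts to something like this (an illustrative sketch only; copy64 is a made-up name, and the real lowering happens in SelectionDAG, not in source):

#include <emmintrin.h>

/* Copy 64 bytes as four 128-bit SSE loads/stores instead of calling memcpy. */
static void copy64(void *dst, const void *src) {
  for (int i = 0; i < 4; ++i) {
    __m128i v = _mm_loadu_si128((const __m128i *)src + i);
    _mm_storeu_si128((__m128i *)dst + i, v);
  }
}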
     98 
     99 //===---------------------------------------------------------------------===//
    100 
    101 Codegen:
    102   if (copysign(1.0, x) == copysign(1.0, y))
    103 into a sign-bit test on the bit patterns (mask = the sign-bit mask):
    104   if (((x ^ y) & mask) == 0)
    105 when using SSE.
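
A scalar sketch of the transform (illustrative; same_sign is a made-up name, and the memcpy is just the portable way to spell the bitcast):

#include <stdint.h>
#include <string.h>

/* copysign(1.0, x) == copysign(1.0, y) iff the sign bits of x and y match,
   so it suffices to test the sign bit of the xor'd bit patterns. */
static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
}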
    106 
    107 //===---------------------------------------------------------------------===//
    108 
    109 Use movhps to update the upper 64 bits of a v4sf value, and movlps to update
    110 the lower half.
    111 
    112 //===---------------------------------------------------------------------===//
    113 
    114 Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
    115 Perhaps use pxor / xorp* to clear an XMM register first?
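
In intrinsics terms the { x, 0, 0, 0 } case is just xorps + movss (a sketch; low_elt_zero_rest is a made-up name):

#include <xmmintrin.h>

/* Build { v[0], 0, 0, 0 }: clear a register, then movss the low element into it. */
static __m128 low_elt_zero_rest(__m128 v) {
  return _mm_move_ss(_mm_setzero_ps(), v);
}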
    116 
    117 //===---------------------------------------------------------------------===//
    118 
    119 External test Nurbs exposed some problems. Look for
    120 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
    121 emits:
    122 
    123         movaps    (%edx), %xmm2                                 #59.21
    124         movaps    (%edx), %xmm5                                 #60.21
    125         movaps    (%edx), %xmm4                                 #61.21
    126         movaps    (%edx), %xmm3                                 #62.21
    127         movl      40(%ecx), %ebp                                #69.49
    128         shufps    $0, %xmm2, %xmm5                              #60.21
    129         movl      100(%esp), %ebx                               #69.20
    130         movl      (%ebx), %edi                                  #69.20
    131         imull     %ebp, %edi                                    #69.49
    132         addl      (%eax), %edi                                  #70.33
    133         shufps    $85, %xmm2, %xmm4                             #61.21
    134         shufps    $170, %xmm2, %xmm3                            #62.21
    135         shufps    $255, %xmm2, %xmm2                            #63.21
    136         lea       (%ebp,%ebp,2), %ebx                           #69.49
    137         negl      %ebx                                          #69.49
    138         lea       -3(%edi,%ebx), %ebx                           #70.33
    139         shll      $4, %ebx                                      #68.37
    140         addl      32(%ecx), %ebx                                #68.37
    141         testb     $15, %bl                                      #91.13
    142         jne       L_B1.24       # Prob 5%                       #91.13
    143 
    144 This is the llvm code after instruction scheduling:
    145 
    146 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    147 	%reg1078 = MOV32ri -3
    148 	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
    149 	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
    150 	%reg1080 = IMUL32rr %reg1079, %reg1037
    151 	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
    152 	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    153 	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
    154 	%reg1082 = SHL32ri %reg1038, 4
    155 	%reg1039 = ADD32rr %reg1036, %reg1082
    156 	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
    157 	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    158 	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    159 	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    160 	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    161 	%reg1040 = MOV32rr %reg1039
    162 	%reg1084 = AND32ri8 %reg1039, 15
    163 	CMP32ri8 %reg1084, 0
    164 	JE mbb<cond_next204,0xa914d30>
    165 
    166 Still ok. After register allocation:
    167 
    168 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    169 	%EAX = MOV32ri -3
    170 	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
    171 	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
    172 	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
    173 	%EDX = MOV32rm %EDX, 1, %NOREG, 40
    174 	IMUL32rr %EAX<def&use>, %EDX
    175 	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
    176 	%ESI = MOV32rm %ESI, 1, %NOREG, 0
    177 	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
    178 	%EAX = LEA32r %ESI, 1, %EAX, -3
    179 	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
    180 	%ESI = MOV32rm %ESI, 1, %NOREG, 32
    181 	%EDI = MOV32rr %EAX
    182 	SHL32ri %EDI<def&use>, 4
    183 	ADD32rr %EDI<def&use>, %ESI
    184 	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
    185 	%XMM1 = MOVAPSrr %XMM0
    186 	SHUFPSrr %XMM1<def&use>, %XMM1, 170
    187 	%XMM2 = MOVAPSrr %XMM0
    188 	SHUFPSrr %XMM2<def&use>, %XMM2, 0
    189 	%XMM3 = MOVAPSrr %XMM0
    190 	SHUFPSrr %XMM3<def&use>, %XMM3, 255
    191 	SHUFPSrr %XMM0<def&use>, %XMM0, 85
    192 	%EBX = MOV32rr %EDI
    193 	AND32ri8 %EBX<def&use>, 15
    194 	CMP32ri8 %EBX, 0
    195 	JE mbb<cond_next204,0xa914d30>
    196 
    197 This looks really bad. The problem is that shufps is a destructive opcode: since
    198 the same value appears as operand two of more than one shufps, a number of
    199 copies result. Note that icc suffers from the same problem. Either the
    200 instruction selector should select pshufd, or the register allocator should
    201 perform the two-address to three-address transformation.
    202 
    203 It also exposes some other problems. See MOV32ri -3 and the spills.
    204 
    205 //===---------------------------------------------------------------------===//
    206 
    207 Consider:
    208 
    209 __m128 test(float a) {
    210   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
    211 }
    212 
    213 This compiles into:
    214 
    215 movss 4(%esp), %xmm1
    216 mulss %xmm1, %xmm1
    217 xorps %xmm0, %xmm0
    218 movss %xmm1, %xmm0
    219 ret
    220 
    221 Because movss from memory zeros the top 3 elements and mulss doesn't modify
    222 them, the top elements of xmm1 are already zeroed.  We could compile this to:
    223 
    224 movss 4(%esp), %xmm0
    225 mulss %xmm0, %xmm0
    226 ret
    227 
    228 //===---------------------------------------------------------------------===//
    229 
    230 Here's a sick and twisted idea.  Consider code like this:
    231 
    232 __m128 test(__m128 a) {
    233   float b = *(float*)&a;
    234   ...
    235   return _mm_set_ps(0.0, 0.0, 0.0, b);
    236 }
    237 
    238 This might compile to this code:
    239 
    240 movaps c(%esp), %xmm1
    241 xorps %xmm0, %xmm0
    242 movss %xmm1, %xmm0
    243 ret
    244 
    245 Now consider if the ... code caused xmm1 to get spilled.  This might produce
    246 this code:
    247 
    248 movaps c(%esp), %xmm1
    249 movaps %xmm1, c2(%esp)
    250 ...
    251 
    252 xorps %xmm0, %xmm0
    253 movaps c2(%esp), %xmm1
    254 movss %xmm1, %xmm0
    255 ret
    256 
    257 However, since the reload is only used by these instructions, we could 
    258 "fold" it into the uses, producing something like this:
    259 
    260 movaps c(%esp), %xmm1
    261 movaps %xmm1, c2(%esp)
    262 ...
    263 
    264 movss c2(%esp), %xmm0
    265 ret
    266 
    267 ... saving two instructions.
    268 
    269 The basic idea is that a reload from a spill slot can, if only one 4-byte
    270 chunk is used, bring in three zeros plus the one element instead of all four
    271 elements.  This can be used to simplify a variety of shuffle operations where
    272 some of the elements are known to be zero.
    273 
    274 //===---------------------------------------------------------------------===//
    275 
    276 This generates ugly code, probably due to the costs being off:
    277 
    278 define void @test(float* %P, <4 x float>* %P2 ) {
    279         %xFloat0.688 = load float* %P
    280         %tmp = load <4 x float>* %P2
    281         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
    282         store <4 x float> %inFloat3.713, <4 x float>* %P2
    283         ret void
    284 }
    285 
    286 Generates:
    287 
    288 _test:
    289 	movl	8(%esp), %eax
    290 	movaps	(%eax), %xmm0
    291 	pxor	%xmm1, %xmm1
    292 	movaps	%xmm0, %xmm2
    293 	shufps	$50, %xmm1, %xmm2
    294 	shufps	$132, %xmm2, %xmm0
    295 	movaps	%xmm0, (%eax)
    296 	ret
    297 
    298 Would it be better to generate:
    299 
    300 _test:
    301         movl 8(%esp), %ecx
    302         movaps (%ecx), %xmm0
    303 	xor %eax, %eax
    304         pinsrw $6, %eax, %xmm0
    305         pinsrw $7, %eax, %xmm0
    306         movaps %xmm0, (%ecx)
    307         ret
    308 
    309 ?
    310 
    311 //===---------------------------------------------------------------------===//
    312 
    313 Some useful information in the Apple Altivec / SSE Migration Guide:
    314 
    315 http://developer.apple.com/documentation/Performance/Conceptual/
    316 Accelerate_sse_migration/index.html
    317 
    318 e.g. SSE select using and, andnot, or. Various SSE compare translations.
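
The select idiom from the guide, in intrinsics (a sketch; select_ps is a made-up name, and mask is assumed to be all-ones or all-zeros per lane, e.g. the result of a cmpps):

#include <xmmintrin.h>

/* Branchless select: (mask & a) | (~mask & b). */
static __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}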
    319 
    320 //===---------------------------------------------------------------------===//
    321 
    322 Add hooks to commute some CMPP operations.
    323 
    324 //===---------------------------------------------------------------------===//
    325 
    326 Apply the same transformation that merges four float loads into a single
    327 128-bit load to loads from the constant pool.
    328 
    329 //===---------------------------------------------------------------------===//
    330 
    331 Floating point max / min are commutable when -enable-unsafe-fp-path is
    332 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
    333 nodes which are selected to max / min instructions that are marked commutable.
    334 
    335 //===---------------------------------------------------------------------===//
    336 
    337 We should materialize vector constants like "all ones" and "signbit" with 
    338 code like:
    339 
    340      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    341 
    342 and:
    343      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
    344      pslld   xmm1, 31     ; xmm1 = all 100000000000...
    345 
    346 instead of using a load from the constant pool.  The latter is important for
    347 ABS/NEG/copysign etc.
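
In intrinsics the two idioms look roughly like this (a sketch with made-up helper names; it uses the integer compare pcmpeqd so the result is all-ones regardless of what the register happens to hold):

#include <emmintrin.h>

/* All-ones: pcmpeqd of a register with itself. */
static __m128i all_ones(void) {
  __m128i z = _mm_setzero_si128();
  return _mm_cmpeq_epi32(z, z);
}

/* Sign-bit mask, 0x80000000 in every lane: all-ones shifted left by 31. */
static __m128 signbit_mask_ps(void) {
  return _mm_castsi128_ps(_mm_slli_epi32(all_ones(), 31));
}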
    348 
    349 //===---------------------------------------------------------------------===//
    350 
    351 These functions:
    352 
    353 #include <xmmintrin.h>
    354 __m128i a;
    355 void x(unsigned short n) {
    356   a = _mm_slli_epi32 (a, n);
    357 }
    358 void y(unsigned n) {
    359   a = _mm_slli_epi32 (a, n);
    360 }
    361 
    362 compile to ( -O3 -static -fomit-frame-pointer):
    363 _x:
    364         movzwl  4(%esp), %eax
    365         movd    %eax, %xmm0
    366         movaps  _a, %xmm1
    367         pslld   %xmm0, %xmm1
    368         movaps  %xmm1, _a
    369         ret
    370 _y:
    371         movd    4(%esp), %xmm0
    372         movaps  _a, %xmm1
    373         pslld   %xmm0, %xmm1
    374         movaps  %xmm1, _a
    375         ret
    376 
    377 "y" looks good, but "x" does silly movzwl shuffling through a GPR.  It seems
    378 like movd would be sufficient in both cases, as the value is already zero
    379 extended in the 32-bit stack slot IIRC.  For signed short it should also be
    380 safe, since a genuinely negative value would be an undefined shift count for pslld.
    381 
    382 
    383 //===---------------------------------------------------------------------===//
    384 
    385 #include <math.h>
    386 int t1(double d) { return signbit(d); }
    387 
    388 This currently compiles to:
    389 	subl	$12, %esp
    390 	movsd	16(%esp), %xmm0
    391 	movsd	%xmm0, (%esp)
    392 	movl	4(%esp), %eax
    393 	shrl	$31, %eax
    394 	addl	$12, %esp
    395 	ret
    396 
    397 We should use movmskp{s|d} instead.
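
The intended lowering, expressed with intrinsics (illustrative only; signbit_movmsk is a made-up name):

#include <emmintrin.h>

/* signbit(double) via movmskpd: extract the sign bit directly, no store/reload. */
static int signbit_movmsk(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}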
    398 
    399 //===---------------------------------------------------------------------===//
    400 
    401 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
    402 (aligned) vector load.  This functionality has a couple of problems.
    403 
    404 1. The code to infer alignment from loads of globals is in the X86 backend,
    405    not the dag combiner.  This is because dagcombine2 needs to be able to see
    406    through the X86ISD::Wrapper node, which DAGCombine can't really do.
    407 2. The code for turning 4 x load into a single vector load is target 
    408    independent and should be moved to the dag combiner.
    409 3. The code for turning 4 x load into a vector load can only handle a direct 
    410    load from a global or a direct load from the stack.  It should be generalized
    411    to handle any load from P, P+4, P+8, P+12, where P can be anything.
    412 4. The alignment inference code cannot handle loads from globals in non-static
    413    mode because it doesn't look through the extra dyld stub load.  If you try
    414    vec_align.ll without -relocation-model=static, you'll see what I mean.
    415 
    416 //===---------------------------------------------------------------------===//
    417 
    418 We should lower store(fneg(load p), q) into an integer load+xor+store, which
    419 eliminates a constant pool load.  For example, consider:
    420 
    421 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
    422 entry:
    423  %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
    424  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
    425  ret i64 %tmp20
    426 }
    427 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
    428 
    429 This currently compiles to:
    430 
    431 LCPI1_0:					#  <4 x float>
    432 	.long	2147483648	# float -0
    433 	.long	2147483648	# float -0
    434 	.long	2147483648	# float -0
    435 	.long	2147483648	# float -0
    436 _ccosf:
    437 	subl	$12, %esp
    438 	movss	16(%esp), %xmm0
    439 	movss	%xmm0, 4(%esp)
    440 	movss	20(%esp), %xmm0
    441 	xorps	LCPI1_0, %xmm0
    442 	movss	%xmm0, (%esp)
    443 	call	L_ccoshf$stub
    444 	addl	$12, %esp
    445 	ret
    446 
    447 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
    448 this code computes the pic base and does two loads to do the constant pool 
    449 load, so the improvement is much bigger.
    450 
    451 The tricky part about this xform is that the argument load/store isn't exposed
    452 until post-legalize, and at that point, the fneg has been custom expanded into 
    453 an X86 fxor.  This means that we need to handle this case in the x86 backend
    454 instead of in target independent code.
    455 
    456 //===---------------------------------------------------------------------===//
    457 
    458 Non-SSE4 insert into 16 x i8 is atrociously bad.
    459 
    460 //===---------------------------------------------------------------------===//
    461 
    462 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
    463 is memory.
    464 
    465 //===---------------------------------------------------------------------===//
    466 
    467 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
    468 any number of 0.0 simultaneously.  Currently we only use it for simple
    469 insertions.
    470 
    471 See comments in LowerINSERT_VECTOR_ELT_SSE4.
    472 
    473 //===---------------------------------------------------------------------===//
    474 
    475 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
    476 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
    477 legal; it'll just take a few extra patterns written in the .td file.
    478 
    479 Note: this is not a code quality issue; the custom lowered code happens to be
    480 right, but we shouldn't have to custom lower anything.  This is probably related
    481 to <2 x i64> ops being so bad.
    482 
    483 //===---------------------------------------------------------------------===//
    484 
    485 LLVM currently generates stack realignment code when it is not actually
    486 needed. The problem is that we need to know about stack alignment too early,
    487 before RA runs.
    488 
    489 At that point we don't know whether there will be a vector spill or not.
    490 Stack realignment logic is overly conservative here, but otherwise we can
    491 produce unaligned loads/stores.
    492 
    493 Fixing this will require some huge RA changes.
    494 
    495 Testcase:
    496 #include <emmintrin.h>
    497 
    498 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
    499 
    500 static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
    501                           -22725, -12873};
    502 
    503 vSInt16 madd(vSInt16 b)
    504 {
    505     return _mm_madd_epi16(a, b);
    506 }
    507 
    508 Generated code (x86-32, linux):
    509 madd:
    510         pushl   %ebp
    511         movl    %esp, %ebp
    512         andl    $-16, %esp
    513         movaps  .LCPI1_0, %xmm1
    514         pmaddwd %xmm1, %xmm0
    515         movl    %ebp, %esp
    516         popl    %ebp
    517         ret
    518 
    519 //===---------------------------------------------------------------------===//
    520 
    521 Consider:
    522 #include <emmintrin.h> 
    523 __m128 foo2 (float x) {
    524  return _mm_set_ps (0, 0, x, 0);
    525 }
    526 
    527 In x86-32 mode, we generate this spiffy code:
    528 
    529 _foo2:
    530 	movss	4(%esp), %xmm0
    531 	pshufd	$81, %xmm0, %xmm0
    532 	ret
    533 
    534 In x86-64 mode, we generate this code, which could be better:
    535 
    536 _foo2:
    537 	xorps	%xmm1, %xmm1
    538 	movss	%xmm0, %xmm1
    539 	pshufd	$81, %xmm1, %xmm0
    540 	ret
    541 
    542 In SSE4 mode, we could use insertps to make both better; see the sketch below.
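
For example (an SSE4.1 sketch; foo2_sse41 is a made-up name, and the 0x10 immediate places element 0 of the source into lane 1 of the destination):

#include <smmintrin.h>

/* _mm_set_ps(0, 0, x, 0) as a single insertps into a zeroed register. */
static __m128 foo2_sse41(float x) {
  return _mm_insert_ps(_mm_setzero_ps(), _mm_set_ss(x), 0x10);
}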
    543 
    544 Here's another testcase that could use insertps [mem]:
    545 
    546 #include <xmmintrin.h>
    547 extern float x2, x3;
    548 __m128 foo1 (float x1, float x4) {
    549  return _mm_set_ps (x2, x1, x3, x4);
    550 }
    551 
    552 gcc mainline compiles it to:
    553 
    554 foo1:
    555        insertps        $0x10, x2(%rip), %xmm0
    556        insertps        $0x10, x3(%rip), %xmm1
    557        movaps  %xmm1, %xmm2
    558        movlhps %xmm0, %xmm2
    559        movaps  %xmm2, %xmm0
    560        ret
    561 
    562 //===---------------------------------------------------------------------===//
    563 
    564 We compile vector multiply-by-constant into poor code:
    565 
    566 define <4 x i32> @f(<4 x i32> %i) nounwind  {
    567 	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
    568 	ret <4 x i32> %A
    569 }
    570 
    571 On targets without SSE4.1, this compiles into:
    572 
    573 LCPI1_0:					##  <4 x i32>
    574 	.long	10
    575 	.long	10
    576 	.long	10
    577 	.long	10
    578 	.text
    579 	.align	4,0x90
    580 	.globl	_f
    581 _f:
    582 	pshufd	$3, %xmm0, %xmm1
    583 	movd	%xmm1, %eax
    584 	imull	LCPI1_0+12, %eax
    585 	movd	%eax, %xmm1
    586 	pshufd	$1, %xmm0, %xmm2
    587 	movd	%xmm2, %eax
    588 	imull	LCPI1_0+4, %eax
    589 	movd	%eax, %xmm2
    590 	punpckldq	%xmm1, %xmm2
    591 	movd	%xmm0, %eax
    592 	imull	LCPI1_0, %eax
    593 	movd	%eax, %xmm1
    594 	movhlps	%xmm0, %xmm0
    595 	movd	%xmm0, %eax
    596 	imull	LCPI1_0+8, %eax
    597 	movd	%eax, %xmm0
    598 	punpckldq	%xmm0, %xmm1
    599 	movaps	%xmm1, %xmm0
    600 	punpckldq	%xmm2, %xmm0
    601 	ret
    602 
    603 It would be better to synthesize integer vector multiplication by constants
    604 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
    605 simple cases such as multiplication by powers of two would be better as
    606 vector shifts than as multiplications.
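
For the multiply-by-10 case above, the shift-and-add expansion is just (a sketch; mul10_epi32 is a made-up name):

#include <emmintrin.h>

/* x*10 == (x<<3) + (x<<1): two pslld and one paddd instead of four scalar imulls. */
static __m128i mul10_epi32(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}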
    607 
    608 //===---------------------------------------------------------------------===//
    609 
    610 We compile this:
    611 
    612 __m128i
    613 foo2 (char x)
    614 {
    615   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
    616 }
    617 
    618 into:
    619 	movl	$1, %eax
    620 	xorps	%xmm0, %xmm0
    621 	pinsrw	$2, %eax, %xmm0
    622 	movzbl	4(%esp), %eax
    623 	pinsrw	$3, %eax, %xmm0
    624 	movl	$256, %eax
    625 	pinsrw	$7, %eax, %xmm0
    626 	ret
    627 
    628 
    629 gcc-4.2:
    630 	subl	$12, %esp
    631 	movzbl	16(%esp), %eax
    632 	movdqa	LC0, %xmm0
    633 	pinsrw	$3, %eax, %xmm0
    634 	addl	$12, %esp
    635 	ret
    636 	.const
    637 	.align 4
    638 LC0:
    639 	.word	0
    640 	.word	0
    641 	.word	1
    642 	.word	0
    643 	.word	0
    644 	.word	0
    645 	.word	0
    646 	.word	256
    647 
    648 With SSE4, it should be
    649       movdqa  .LC0(%rip), %xmm0
    650       pinsrb  $6, %edi, %xmm0
    651 
    652 //===---------------------------------------------------------------------===//
    653 
    654 We should transform a shuffle of two vectors of constants into a single vector
    655 of constants. Likewise, insertelement of a constant into a vector of constants
    656 should result in a vector of constants.  See e.g. 2008-06-25-VecISelBug.ll.
    657 
    658 We compiled it to something horrible:
    659 
    660 	.align	4
    661 LCPI1_1:					##  float
    662 	.long	1065353216	## float 1
    663 	.const
    664 
    665 	.align	4
    666 LCPI1_0:					##  <4 x float>
    667 	.space	4
    668 	.long	1065353216	## float 1
    669 	.space	4
    670 	.long	1065353216	## float 1
    671 	.text
    672 	.align	4,0x90
    673 	.globl	_t
    674 _t:
    675 	xorps	%xmm0, %xmm0
    676 	movhps	LCPI1_0, %xmm0
    677 	movss	LCPI1_1, %xmm1
    678 	movaps	%xmm0, %xmm2
    679 	shufps	$2, %xmm1, %xmm2
    680 	shufps	$132, %xmm2, %xmm0
    681 	movaps	%xmm0, 0
    682 
    683 //===---------------------------------------------------------------------===//
    684 rdar://5907648
    685 
    686 This function:
    687 
    688 float foo(unsigned char x) {
    689   return x;
    690 }
    691 
    692 compiles to this IR (x86-32):
    693 
    694 define float @foo(i8 zeroext  %x) nounwind  {
    695 	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
    696 	ret float %tmp12
    697 }
    698 
    699 which compiles to:
    700 
    701 _foo:
    702 	subl	$4, %esp
    703 	movzbl	8(%esp), %eax
    704 	cvtsi2ss	%eax, %xmm0
    705 	movss	%xmm0, (%esp)
    706 	flds	(%esp)
    707 	addl	$4, %esp
    708 	ret
    709 
    710 We should be able to use:
    711   cvtsi2ss 8(%esp), %xmm0
    712 since we know the stack slot is already zext'd.
    713 
    714 //===---------------------------------------------------------------------===//
    715 
    716 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
    717 when code size is critical. movlps is slower than movsd on core2 but it's one
    718 byte shorter.
    719 
    720 //===---------------------------------------------------------------------===//
    721 
    722 We should use a dynamic programming based approach to tell when using FPStack
    723 operations is cheaper than SSE.  SciMark montecarlo contains code like this
    724 for example:
    725 
    726 double MonteCarlo_num_flops(int Num_samples) {
    727     return ((double) Num_samples)* 4.0;
    728 }
    729 
    730 In fpstack mode, this compiles into:
    731 
    732 LCPI1_0:					
    733 	.long	1082130432	## float 4.000000e+00
    734 _MonteCarlo_num_flops:
    735 	subl	$4, %esp
    736 	movl	8(%esp), %eax
    737 	movl	%eax, (%esp)
    738 	fildl	(%esp)
    739 	fmuls	LCPI1_0
    740 	addl	$4, %esp
    741 	ret
    742         
    743 in SSE mode, it compiles into significantly slower code:
    744 
    745 _MonteCarlo_num_flops:
    746 	subl	$12, %esp
    747 	cvtsi2sd	16(%esp), %xmm0
    748 	mulsd	LCPI1_0, %xmm0
    749 	movsd	%xmm0, (%esp)
    750 	fldl	(%esp)
    751 	addl	$12, %esp
    752 	ret
    753 
    754 There are also other cases in SciMark where using the FP stack is better; for
    755 example, it is cheaper to do fld1 than to load 1.0 from a constant pool, so
    756 "load, add 1.0, store" is better done on the FP stack, etc.
    757 
    758 //===---------------------------------------------------------------------===//
    759 
    760 These should compile into the same code (PR6214); perhaps instcombine should
    761 canonicalize the former into the latter?
    762 
    763 define float @foo(float %x) nounwind {
    764   %t = bitcast float %x to i32
    765   %s = and i32 %t, 2147483647
    766   %d = bitcast i32 %s to float
    767   ret float %d
    768 }
    769 
    770 declare float @fabsf(float %n)
    771 define float @bar(float %x) nounwind {
    772   %d = call float @fabsf(float %x)
    773   ret float %d
    774 }
    775 
    776 //===---------------------------------------------------------------------===//
    777 
    778 This IR (from PR6194):
    779 
    780 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
    781 target triple = "x86_64-apple-darwin10.0.0"
    782 
    783 %0 = type { double, double }
    784 %struct.float3 = type { float, float, float }
    785 
    786 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
    787 entry:
    788   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
    789   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
    790   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
    791   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
    792   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
    793   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
    794   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
    795   store float %tmp12, float* %tmp5
    796   ret void
    797 }
    798 
    799 Compiles to:
    800 
    801 _test:                                  ## @test
    802 	movd	%xmm0, %rax
    803 	shrq	$32, %rax
    804 	movl	%eax, 4(%rdi)
    805 	ret
    806 
    807 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
    808 doing a shuffle from v[1] to v[0] then a float store.
    809 
    810 //===---------------------------------------------------------------------===//
    811 
    812 [UNSAFE FP]
    813 
    814 void foo(double, double, double);
    815 void norm(double x, double y, double z) {
    816   double scale = __builtin_sqrt(x*x + y*y + z*z);
    817   foo(x/scale, y/scale, z/scale);
    818 }
    819 
    820 We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
    821 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
    822 and emit 3 mulsd in place of the divs. This can be done as a target-independent
    823 transform.
    824 
    825 If we're dealing with floats instead of doubles we could even replace the sqrtss
    826 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
    827 cost of reduced accuracy.
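
For the float case the usual recipe is rsqrtss plus one Newton-Raphson step to recover most of the lost precision (a sketch; fast_inv_sqrt is a made-up name, and this is only legal under fast-math rules):

#include <xmmintrin.h>

/* 1/sqrt(x): rsqrtss gives roughly a 12-bit estimate, and one Newton step
   r' = r * (1.5 - 0.5*x*r*r) refines it. */
static float fast_inv_sqrt(float x) {
  float r = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
  return r * (1.5f - 0.5f * x * r * r);
}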
    828 
    829 //===---------------------------------------------------------------------===//
    830 
    831 This function should be matched to haddpd when the appropriate CPU is enabled:
    832 
    833 #include <x86intrin.h>
    834 double f (__m128d p) {
    835   return p[0] + p[1];
    836 }
    837 
    838 Similarly, v[0]-v[1] should match hsubpd, and {v[0]-v[1], w[0]-w[1]} should
    839 turn into hsubpd as well.
    840 
    841 //===---------------------------------------------------------------------===//
    842 
    843 define <2 x i32> @foo(<2 x double> %in) {
    844   %x = fptosi <2 x double> %in to <2 x i32>
    845   ret <2 x i32> %x
    846 }
    847 
    848 Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
    849 
    850 //===---------------------------------------------------------------------===//
    851