; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
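
; If codegen changes, the CHECK lines below can be regenerated with the script
; named in the NOTE above. A typical invocation (a sketch; the path to this
; test file is assumed, not part of the original):
;   utils/update_llc_test_checks.py llvm/test/CodeGen/X86/<this-test>.ll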

; If we are transferring XMM conversion results to MMX registers, we can use the
; MMX equivalents (CVTPD2PI/CVTTPD2PI and CVTPS2PI/CVTTPS2PI) without affecting
; rounding, exceptions, etc.
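
; For illustration only (a sketch, not checked by FileCheck): rather than
; converting in XMM and then transferring the low 64 bits, e.g.
;   cvtpd2dq %xmm0, %xmm0  # packed f64 -> i32, rounds per the current MXCSR mode
;   movdq2q  %xmm0, %mm0   # move the low quadword into an MMX register
; the backend emits the single MMX-destination form:
;   cvtpd2pi %xmm0, %mm0   # same conversion, result written directly to MMX
; Both forms round according to MXCSR (the CVTT* variants truncate instead), so
; the substitution preserves rounding and exception behavior.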

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i32> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  %7 = bitcast x86_mmx %6 to i64
  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
  store <1 x i64> %8, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion, we could
; use the MMX equivalents (CVTPI2PD and CVTPI2PS) without affecting rounding,
; exceptions, etc.
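
; A sketch of the desired codegen (illustrative only, not checked by FileCheck):
; instead of the current X64 sequence
;   movq2dq  %mm0, %xmm0   # transfer the MMX value to XMM
;   cvtdq2pd %xmm0, %xmm0  # packed i32 -> f64
; we could emit
;   cvtpi2pd %mm0, %xmm0   # convert two i32s from MMX directly to f64 in XMM
; CVTPI2PD is exact (every i32 is representable as f64), and CVTPI2PS rounds
; per MXCSR just like CVTDQ2PS, so the substitution is behavior-preserving.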

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    cvtdq2pd (%esp), %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}

define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to <2 x i32>
  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %7 = sitofp <4 x i32> %6 to <4 x float>
  ret <4 x float> %7
}

define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    movq %rax, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = insertelement <2 x i64> %6, i64 0, i32 1
  %8 = bitcast <2 x i64> %7 to <4 x i32>
  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
  ret <4 x float> %9
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)