; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions etc.

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttpd2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttpd2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvtps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvtps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i32> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  %7 = bitcast x86_mmx %6 to i64
  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
  store <1 x i64> %8, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    cvttps2pi %xmm0, %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movl (%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64:       # %bb.0:
; X64-NEXT:    cvttps2pi %xmm0, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion we could use the MMX equivalents
; (CVTPI2PD + CVTPI2PS) without affecting rounding/exceptions etc.

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    cvtdq2pd (%esp), %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}

define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to <2 x i32>
  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %7 = sitofp <4 x i32> %6 to <4 x float>
  ret <4 x float> %7
}

define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movq (%eax), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: cvt_v2i32_v2f32:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, %rax
; X64-NEXT:    movq %rax, %xmm0
; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
; X64-NEXT:    retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = insertelement <2 x i64> %6, i64 0, i32 1
  %8 = bitcast <2 x i64> %7 to <4 x i32>
  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
  ret <4 x float> %9
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)