; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Signed Integer to Double
;

define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %cvt = sitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: # kill
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: movapd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %cvt = sitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Unsigned Integer to Double
;

define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: subpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i64_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: retq
  %cvt = uitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: subpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT: retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: subpd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT: addpd %xmm3, %xmm5
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: subpd %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT: addpd %xmm3, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %cvt = uitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm5, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm6, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm6
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Signed Integer to Float
;

define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT:
vpmovsxwd %xmm0, %xmm0 882 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 883 ; AVX-NEXT: retq 884 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 885 %cvt = sitofp <4 x i16> %shuf to <4 x float> 886 ret <4 x float> %cvt 887 } 888 889 define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { 890 ; SSE-LABEL: sitofp_8i16_to_4f32: 891 ; SSE: # BB#0: 892 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 893 ; SSE-NEXT: psrad $16, %xmm0 894 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 895 ; SSE-NEXT: retq 896 ; 897 ; AVX1-LABEL: sitofp_8i16_to_4f32: 898 ; AVX1: # BB#0: 899 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 900 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 901 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 902 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 903 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 904 ; AVX1-NEXT: # kill 905 ; AVX1-NEXT: vzeroupper 906 ; AVX1-NEXT: retq 907 ; 908 ; AVX2-LABEL: sitofp_8i16_to_4f32: 909 ; AVX2: # BB#0: 910 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 911 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 912 ; AVX2-NEXT: # kill 913 ; AVX2-NEXT: vzeroupper 914 ; AVX2-NEXT: retq 915 %cvt = sitofp <8 x i16> %a to <8 x float> 916 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 917 ret <4 x float> %shuf 918 } 919 920 define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) { 921 ; SSE-LABEL: sitofp_4i8_to_4f32: 922 ; SSE: # BB#0: 923 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 924 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 925 ; SSE-NEXT: psrad $24, %xmm0 926 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 927 ; SSE-NEXT: retq 928 ; 929 ; AVX-LABEL: sitofp_4i8_to_4f32: 930 ; AVX: # BB#0: 931 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 932 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 933 ; AVX-NEXT: retq 934 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 935 %cvt = sitofp <4 x i8> %shuf to <4 x float> 936 ret <4 x float> %cvt 937 } 938 
; <16 x i8> converted, only low 4 floats kept; SSE narrows to the low lanes.
define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

; Full <4 x i64> -> <4 x float>: four scalar cvtsi2ssq conversions and
; lane reassembly (unpcklps on SSE, vinsertps chain on AVX).
define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

; <8 x i32> -> <8 x float>: two cvtdq2ps on SSE (128-bit halves), one
; 256-bit vcvtdq2ps on AVX.
define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT: retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

; <8 x i16> -> <8 x float>: SSE sign-extends low/high halves separately;
; AVX2 does a single vpmovsxwd to ymm.
define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

; Low 8 x i8 -> <8 x float>.
define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

; <16 x i8> converted, only low 8 floats kept.
define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;

; <2 x i64> -> <2 x float> unsigned: per-lane sign test with a branchy
; shift/or/convert/double sequence for values with the top bit set.
define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB38_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB38_3
; SSE-NEXT: .LBB38_1:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB38_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB38_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
; SSE-NEXT: .LBB38_4:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i64_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1, %ecx
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB38_1
; AVX-NEXT: # BB#2:
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: jmp .LBB38_3
; AVX-NEXT: .LBB38_1:
; AVX-NEXT: shrq %rax
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT: .LBB38_3:
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1, %ecx
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB38_4
; AVX-NEXT: # BB#5:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: jmp .LBB38_6
; AVX-NEXT: .LBB38_4:
; AVX-NEXT: shrq %rax
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT: .LBB38_6:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB38_8
; AVX-NEXT: # BB#7:
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: .LBB38_8:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

; Unsigned variant of the undef-widened 4i64 test.
define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: js .LBB39_2
; SSE-NEXT: # BB#1:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: .LBB39_2:
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_3
; SSE-NEXT: # BB#4:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB39_5
; SSE-NEXT: .LBB39_3:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB39_5:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_6
; SSE-NEXT: # BB#7:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB39_8
; SSE-NEXT: .LBB39_6:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB39_8:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i64_to_4f32_undef:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1, %ecx
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB39_1
; AVX-NEXT: # BB#2:
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: jmp .LBB39_3
; AVX-NEXT: .LBB39_1:
; AVX-NEXT: shrq %rax
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT: .LBB39_3:
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1, %ecx
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB39_4
; AVX-NEXT: # BB#5:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT: jmp .LBB39_6
; AVX-NEXT: .LBB39_4:
; AVX-NEXT: shrq %rax
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT: .LBB39_6:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: testq %rax, %rax
; AVX-NEXT: js .LBB39_8
; AVX-NEXT: # BB#7:
; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT: .LBB39_8:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = uitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

; <4 x i32> -> <4 x float> unsigned: split each lane into low/high 16-bit
; halves biased into the float mantissa (magic constants), then recombine
; with adds.
define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por {{.*}}(%rip), %xmm1
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: addps {{.*}}(%rip), %xmm0
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
  %cvt = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

; Low 4 x i16 -> <4 x float> unsigned: zero-extend then signed cvtdq2ps
; (safe since i16 fits in the non-negative i32 range).
define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

; Full <8 x i16> converted unsigned, only low 4 floats kept.
define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

; Low 4 x i8 -> <4 x float> unsigned: zero-extend b->d then cvtdq2ps.
define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

; <16 x i8> converted unsigned, only low 4 floats kept.
define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

; Full <4 x i64> -> <4 x float> unsigned: four copies of the branchy
; scalar sequence, one per lane.
define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB45_3
; SSE-NEXT: .LBB45_1:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB45_3:
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB45_6
; SSE-NEXT: .LBB45_4:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB45_6:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB45_9
; SSE-NEXT: .LBB45_7:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB45_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_10
; SSE-NEXT: # BB#11:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB45_12
; SSE-NEXT: .LBB45_10:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB45_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: jmp .LBB45_3
; AVX1-NEXT: .LBB45_1:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB45_3:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB45_6
; AVX1-NEXT: .LBB45_4:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB45_9
; AVX1-NEXT: .LBB45_7:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB45_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: jmp .LBB45_3
; AVX2-NEXT: .LBB45_1:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB45_3:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: jmp .LBB45_6
; AVX2-NEXT: .LBB45_4:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: jmp .LBB45_9
; AVX2-NEXT: .LBB45_7:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB45_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %cvt = uitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

; <8 x i32> -> <8 x float> unsigned: same low/high 16-bit magic-constant
; split as the 4i32 case, applied per 128-bit half on SSE and across a
; full ymm on AVX.
define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: uitofp_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i32_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i32_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %cvt = uitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

; <8 x i16> -> <8 x float> unsigned (definition continues past this chunk).
define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_8f32:
; AVX2: #
BB#0: 1753 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1754 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1755 ; AVX2-NEXT: retq 1756 %cvt = uitofp <8 x i16> %a to <8 x float> 1757 ret <8 x float> %cvt 1758 } 1759 1760 define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { 1761 ; SSE-LABEL: uitofp_8i8_to_8f32: 1762 ; SSE: # BB#0: 1763 ; SSE-NEXT: pxor %xmm1, %xmm1 1764 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1765 ; SSE-NEXT: movdqa %xmm0, %xmm2 1766 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1767 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1768 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1769 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1770 ; SSE-NEXT: movaps %xmm2, %xmm0 1771 ; SSE-NEXT: retq 1772 ; 1773 ; AVX1-LABEL: uitofp_8i8_to_8f32: 1774 ; AVX1: # BB#0: 1775 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1776 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1777 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1778 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1779 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1780 ; AVX1-NEXT: retq 1781 ; 1782 ; AVX2-LABEL: uitofp_8i8_to_8f32: 1783 ; AVX2: # BB#0: 1784 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1785 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1786 ; AVX2-NEXT: retq 1787 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7> 1788 %cvt = uitofp <8 x i8> %shuf to <8 x float> 1789 ret <8 x float> %cvt 1790 } 1791 1792 define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { 1793 ; SSE-LABEL: uitofp_16i8_to_8f32: 1794 ; SSE: # BB#0: 1795 ; SSE-NEXT: pxor %xmm1, %xmm1 1796 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1797 ; SSE-NEXT: movdqa %xmm0, %xmm2 1798 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1799 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1800 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1801 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1802 ; SSE-NEXT: movaps %xmm2, %xmm0 1803 ; SSE-NEXT: retq 1804 ; 1805 ; AVX1-LABEL: uitofp_16i8_to_8f32: 1806 ; AVX1: # BB#0: 1807 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1808 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1809 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1810 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1811 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1812 ; AVX1-NEXT: retq 1813 ; 1814 ; AVX2-LABEL: uitofp_16i8_to_8f32: 1815 ; AVX2: # BB#0: 1816 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1817 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1818 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1819 ; AVX2-NEXT: retq 1820 %cvt = uitofp <16 x i8> %a to <16 x float> 1821 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, 
i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1822 ret <8 x float> %shuf 1823 } 1824 1825 ; 1826 ; Load Signed Integer to Double 1827 ; 1828 1829 define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { 1830 ; SSE-LABEL: sitofp_load_2i64_to_2f64: 1831 ; SSE: # BB#0: 1832 ; SSE-NEXT: movdqa (%rdi), %xmm1 1833 ; SSE-NEXT: movd %xmm1, %rax 1834 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 1835 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1836 ; SSE-NEXT: movd %xmm1, %rax 1837 ; SSE-NEXT: xorps %xmm1, %xmm1 1838 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1839 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1840 ; SSE-NEXT: retq 1841 ; 1842 ; AVX-LABEL: sitofp_load_2i64_to_2f64: 1843 ; AVX: # BB#0: 1844 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 1845 ; AVX-NEXT: vpextrq $1, %xmm0, %rax 1846 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1847 ; AVX-NEXT: vmovq %xmm0, %rax 1848 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1849 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1850 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1851 ; AVX-NEXT: retq 1852 %ld = load <2 x i64>, <2 x i64> *%a 1853 %cvt = sitofp <2 x i64> %ld to <2 x double> 1854 ret <2 x double> %cvt 1855 } 1856 1857 define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) { 1858 ; SSE-LABEL: sitofp_load_2i32_to_2f64: 1859 ; SSE: # BB#0: 1860 ; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 1861 ; SSE-NEXT: retq 1862 ; 1863 ; AVX-LABEL: sitofp_load_2i32_to_2f64: 1864 ; AVX: # BB#0: 1865 ; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 1866 ; AVX-NEXT: retq 1867 %ld = load <2 x i32>, <2 x i32> *%a 1868 %cvt = sitofp <2 x i32> %ld to <2 x double> 1869 ret <2 x double> %cvt 1870 } 1871 1872 define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { 1873 ; SSE-LABEL: sitofp_load_2i16_to_2f64: 1874 ; SSE: # BB#0: 1875 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1876 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1877 ; SSE-NEXT: psrad $16, %xmm0 1878 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 1879 ; SSE-NEXT: retq 1880 ; 1881 ; AVX-LABEL: 
sitofp_load_2i16_to_2f64: 1882 ; AVX: # BB#0: 1883 ; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 1884 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1885 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 1886 ; AVX-NEXT: retq 1887 %ld = load <2 x i16>, <2 x i16> *%a 1888 %cvt = sitofp <2 x i16> %ld to <2 x double> 1889 ret <2 x double> %cvt 1890 } 1891 1892 define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { 1893 ; SSE-LABEL: sitofp_load_2i8_to_2f64: 1894 ; SSE: # BB#0: 1895 ; SSE-NEXT: movzwl (%rdi), %eax 1896 ; SSE-NEXT: movd %eax, %xmm0 1897 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1898 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1899 ; SSE-NEXT: psrad $24, %xmm0 1900 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 1901 ; SSE-NEXT: retq 1902 ; 1903 ; AVX-LABEL: sitofp_load_2i8_to_2f64: 1904 ; AVX: # BB#0: 1905 ; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 1906 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1907 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 1908 ; AVX-NEXT: retq 1909 %ld = load <2 x i8>, <2 x i8> *%a 1910 %cvt = sitofp <2 x i8> %ld to <2 x double> 1911 ret <2 x double> %cvt 1912 } 1913 1914 define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { 1915 ; SSE-LABEL: sitofp_load_4i64_to_4f64: 1916 ; SSE: # BB#0: 1917 ; SSE-NEXT: movdqa (%rdi), %xmm1 1918 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 1919 ; SSE-NEXT: movd %xmm1, %rax 1920 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 1921 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1922 ; SSE-NEXT: movd %xmm1, %rax 1923 ; SSE-NEXT: xorps %xmm1, %xmm1 1924 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1925 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1926 ; SSE-NEXT: movd %xmm2, %rax 1927 ; SSE-NEXT: xorps %xmm1, %xmm1 1928 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1929 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1930 ; SSE-NEXT: movd %xmm2, %rax 1931 ; SSE-NEXT: xorps %xmm2, %xmm2 1932 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2 1933 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1934 ; SSE-NEXT: retq 1935 ; 1936 ; 
AVX1-LABEL: sitofp_load_4i64_to_4f64: 1937 ; AVX1: # BB#0: 1938 ; AVX1-NEXT: vmovaps (%rdi), %ymm0 1939 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1940 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax 1941 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1942 ; AVX1-NEXT: vmovq %xmm1, %rax 1943 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1944 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1945 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 1946 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1947 ; AVX1-NEXT: vmovq %xmm0, %rax 1948 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 1949 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1950 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1951 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1952 ; AVX1-NEXT: retq 1953 ; 1954 ; AVX2-LABEL: sitofp_load_4i64_to_4f64: 1955 ; AVX2: # BB#0: 1956 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1957 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1958 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax 1959 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1960 ; AVX2-NEXT: vmovq %xmm1, %rax 1961 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1962 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1963 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1964 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1965 ; AVX2-NEXT: vmovq %xmm0, %rax 1966 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1967 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1968 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1969 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1970 ; AVX2-NEXT: retq 1971 %ld = load <4 x i64>, <4 x i64> *%a 1972 %cvt = sitofp <4 x i64> %ld to <4 x double> 1973 ret <4 x double> %cvt 1974 } 1975 1976 define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { 1977 ; SSE-LABEL: sitofp_load_4i32_to_4f64: 1978 ; SSE: # BB#0: 1979 ; SSE-NEXT: movdqa (%rdi), %xmm1 1980 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 1981 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1982 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 1983 ; SSE-NEXT: retq 1984 ; 1985 ; AVX-LABEL: sitofp_load_4i32_to_4f64: 1986 ; 
AVX: # BB#0: 1987 ; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0 1988 ; AVX-NEXT: retq 1989 %ld = load <4 x i32>, <4 x i32> *%a 1990 %cvt = sitofp <4 x i32> %ld to <4 x double> 1991 ret <4 x double> %cvt 1992 } 1993 1994 define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { 1995 ; SSE-LABEL: sitofp_load_4i16_to_4f64: 1996 ; SSE: # BB#0: 1997 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1998 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1999 ; SSE-NEXT: psrad $16, %xmm1 2000 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2001 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2002 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2003 ; SSE-NEXT: retq 2004 ; 2005 ; AVX-LABEL: sitofp_load_4i16_to_4f64: 2006 ; AVX: # BB#0: 2007 ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 2008 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2009 ; AVX-NEXT: retq 2010 %ld = load <4 x i16>, <4 x i16> *%a 2011 %cvt = sitofp <4 x i16> %ld to <4 x double> 2012 ret <4 x double> %cvt 2013 } 2014 2015 define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { 2016 ; SSE-LABEL: sitofp_load_4i8_to_4f64: 2017 ; SSE: # BB#0: 2018 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2019 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2020 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2021 ; SSE-NEXT: psrad $24, %xmm1 2022 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2023 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2024 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2025 ; SSE-NEXT: retq 2026 ; 2027 ; AVX-LABEL: sitofp_load_4i8_to_4f64: 2028 ; AVX: # BB#0: 2029 ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 2030 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2031 ; AVX-NEXT: retq 2032 %ld = load <4 x i8>, <4 x i8> *%a 2033 %cvt = sitofp <4 x i8> %ld to <4 x double> 2034 ret <4 x double> %cvt 2035 } 2036 2037 ; 2038 ; Load Unsigned Integer to Double 2039 ; 2040 2041 define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { 2042 ; SSE-LABEL: 
uitofp_load_2i64_to_2f64: 2043 ; SSE: # BB#0: 2044 ; SSE-NEXT: movdqa (%rdi), %xmm1 2045 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2046 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 2047 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2048 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2049 ; SSE-NEXT: subpd %xmm4, %xmm1 2050 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2051 ; SSE-NEXT: addpd %xmm1, %xmm0 2052 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2053 ; SSE-NEXT: subpd %xmm4, %xmm3 2054 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] 2055 ; SSE-NEXT: addpd %xmm3, %xmm1 2056 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2057 ; SSE-NEXT: retq 2058 ; 2059 ; AVX-LABEL: uitofp_load_2i64_to_2f64: 2060 ; AVX: # BB#0: 2061 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 2062 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 2063 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2064 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 2065 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 2066 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 2067 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2068 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2069 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 2070 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2071 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2072 ; AVX-NEXT: retq 2073 %ld = load <2 x i64>, <2 x i64> *%a 2074 %cvt = uitofp <2 x i64> %ld to <2 x double> 2075 ret <2 x double> %cvt 2076 } 2077 2078 define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { 2079 ; SSE-LABEL: uitofp_load_2i32_to_2f64: 2080 ; SSE: # BB#0: 2081 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2082 ; SSE-NEXT: pxor %xmm0, %xmm0 2083 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2084 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2085 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[2,3,0,1] 2086 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2087 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2088 ; SSE-NEXT: subpd %xmm4, %xmm1 2089 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2090 ; SSE-NEXT: addpd %xmm1, %xmm0 2091 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2092 ; SSE-NEXT: subpd %xmm4, %xmm3 2093 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] 2094 ; SSE-NEXT: addpd %xmm3, %xmm1 2095 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2096 ; SSE-NEXT: retq 2097 ; 2098 ; AVX-LABEL: uitofp_load_2i32_to_2f64: 2099 ; AVX: # BB#0: 2100 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2101 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 2102 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2103 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 2104 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 2105 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 2106 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2107 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2108 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 2109 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2110 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2111 ; AVX-NEXT: retq 2112 %ld = load <2 x i32>, <2 x i32> *%a 2113 %cvt = uitofp <2 x i32> %ld to <2 x double> 2114 ret <2 x double> %cvt 2115 } 2116 2117 define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { 2118 ; SSE-LABEL: uitofp_load_2i16_to_2f64: 2119 ; SSE: # BB#0: 2120 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2121 ; SSE-NEXT: pxor %xmm1, %xmm1 2122 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2123 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 2124 ; SSE-NEXT: retq 2125 ; 2126 ; AVX-LABEL: uitofp_load_2i16_to_2f64: 2127 ; AVX: # BB#0: 2128 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2129 ; AVX-NEXT: vpmovzxwd {{.*#+}} 
xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2130 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 2131 ; AVX-NEXT: retq 2132 %ld = load <2 x i16>, <2 x i16> *%a 2133 %cvt = uitofp <2 x i16> %ld to <2 x double> 2134 ret <2 x double> %cvt 2135 } 2136 2137 define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { 2138 ; SSE-LABEL: uitofp_load_2i8_to_2f64: 2139 ; SSE: # BB#0: 2140 ; SSE-NEXT: movzwl (%rdi), %eax 2141 ; SSE-NEXT: movd %eax, %xmm0 2142 ; SSE-NEXT: pxor %xmm1, %xmm1 2143 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2144 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2145 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 2146 ; SSE-NEXT: retq 2147 ; 2148 ; AVX-LABEL: uitofp_load_2i8_to_2f64: 2149 ; AVX: # BB#0: 2150 ; AVX-NEXT: movzwl (%rdi), %eax 2151 ; AVX-NEXT: vmovd %eax, %xmm0 2152 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2153 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 2154 ; AVX-NEXT: retq 2155 %ld = load <2 x i8>, <2 x i8> *%a 2156 %cvt = uitofp <2 x i8> %ld to <2 x double> 2157 ret <2 x double> %cvt 2158 } 2159 2160 define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { 2161 ; SSE-LABEL: uitofp_load_4i64_to_4f64: 2162 ; SSE: # BB#0: 2163 ; SSE-NEXT: movdqa (%rdi), %xmm1 2164 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 2165 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] 2166 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] 2167 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2168 ; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25] 2169 ; SSE-NEXT: subpd %xmm5, %xmm1 2170 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2171 ; SSE-NEXT: addpd %xmm1, %xmm0 2172 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2173 ; SSE-NEXT: subpd %xmm5, 
%xmm4 2174 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] 2175 ; SSE-NEXT: addpd %xmm4, %xmm1 2176 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2177 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] 2178 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 2179 ; SSE-NEXT: subpd %xmm5, %xmm2 2180 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 2181 ; SSE-NEXT: addpd %xmm2, %xmm1 2182 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2183 ; SSE-NEXT: subpd %xmm5, %xmm4 2184 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] 2185 ; SSE-NEXT: addpd %xmm4, %xmm2 2186 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2187 ; SSE-NEXT: retq 2188 ; 2189 ; AVX1-LABEL: uitofp_load_4i64_to_4f64: 2190 ; AVX1: # BB#0: 2191 ; AVX1-NEXT: vmovaps (%rdi), %ymm0 2192 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2193 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2194 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2195 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2196 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2197 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2198 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2199 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2200 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 2201 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 2202 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 2203 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2204 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2205 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2206 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2207 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2208 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 2209 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2210 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 2211 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2212 ; AVX1-NEXT: retq 2213 ; 2214 ; AVX2-LABEL: 
uitofp_load_4i64_to_4f64: 2215 ; AVX2: # BB#0: 2216 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2217 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2218 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2219 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2220 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2221 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2222 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2223 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2224 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2225 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1 2226 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 2227 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 2228 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2229 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2230 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2231 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2232 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2233 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0 2234 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2235 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 2236 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2237 ; AVX2-NEXT: retq 2238 %ld = load <4 x i64>, <4 x i64> *%a 2239 %cvt = uitofp <4 x i64> %ld to <4 x double> 2240 ret <4 x double> %cvt 2241 } 2242 2243 define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { 2244 ; SSE-LABEL: uitofp_load_4i32_to_4f64: 2245 ; SSE: # BB#0: 2246 ; SSE-NEXT: movdqa (%rdi), %xmm2 2247 ; SSE-NEXT: pxor %xmm1, %xmm1 2248 ; SSE-NEXT: movdqa %xmm2, %xmm3 2249 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2250 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0] 2251 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] 2252 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2253 ; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25] 2254 ; SSE-NEXT: subpd %xmm6, 
%xmm3 2255 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] 2256 ; SSE-NEXT: addpd %xmm3, %xmm0 2257 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 2258 ; SSE-NEXT: subpd %xmm6, %xmm5 2259 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1] 2260 ; SSE-NEXT: addpd %xmm5, %xmm3 2261 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] 2262 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2263 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] 2264 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2265 ; SSE-NEXT: subpd %xmm6, %xmm2 2266 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 2267 ; SSE-NEXT: addpd %xmm2, %xmm1 2268 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2269 ; SSE-NEXT: subpd %xmm6, %xmm3 2270 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] 2271 ; SSE-NEXT: addpd %xmm3, %xmm2 2272 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2273 ; SSE-NEXT: retq 2274 ; 2275 ; AVX1-LABEL: uitofp_load_4i32_to_4f64: 2276 ; AVX1: # BB#0: 2277 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2278 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 2279 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 2280 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 2281 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 2282 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 2283 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 2284 ; AVX1-NEXT: retq 2285 ; 2286 ; AVX2-LABEL: uitofp_load_4i32_to_4f64: 2287 ; AVX2: # BB#0: 2288 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2289 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 2290 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 2291 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 2292 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 2293 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 2294 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2295 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 2296 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 2297 ; AVX2-NEXT: retq 2298 %ld = load <4 x i32>, <4 x i32> *%a 2299 %cvt = uitofp <4 x i32> %ld to <4 x double> 2300 ret <4 x double> %cvt 2301 } 
2302 2303 define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { 2304 ; SSE-LABEL: uitofp_load_4i16_to_4f64: 2305 ; SSE: # BB#0: 2306 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2307 ; SSE-NEXT: pxor %xmm0, %xmm0 2308 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2309 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2310 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2311 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2312 ; SSE-NEXT: retq 2313 ; 2314 ; AVX-LABEL: uitofp_load_4i16_to_4f64: 2315 ; AVX: # BB#0: 2316 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2317 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2318 ; AVX-NEXT: retq 2319 %ld = load <4 x i16>, <4 x i16> *%a 2320 %cvt = uitofp <4 x i16> %ld to <4 x double> 2321 ret <4 x double> %cvt 2322 } 2323 2324 define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { 2325 ; SSE-LABEL: uitofp_load_4i8_to_4f64: 2326 ; SSE: # BB#0: 2327 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2328 ; SSE-NEXT: pxor %xmm0, %xmm0 2329 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2330 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2331 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2332 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2333 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2334 ; SSE-NEXT: retq 2335 ; 2336 ; AVX-LABEL: uitofp_load_4i8_to_4f64: 2337 ; AVX: # BB#0: 2338 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2339 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2340 ; AVX-NEXT: retq 2341 %ld = load <4 x i8>, <4 x i8> *%a 2342 %cvt = uitofp <4 x i8> %ld to <4 x double> 2343 ret <4 x double> %cvt 2344 } 2345 2346 ; 2347 ; Load Signed Integer to Float 2348 ; 2349 2350 define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> 
*%a) { 2351 ; SSE-LABEL: sitofp_load_4i64_to_4f32: 2352 ; SSE: # BB#0: 2353 ; SSE-NEXT: movdqa (%rdi), %xmm1 2354 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 2355 ; SSE-NEXT: movd %xmm2, %rax 2356 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2357 ; SSE-NEXT: movd %xmm1, %rax 2358 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2359 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2360 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2361 ; SSE-NEXT: movd %xmm2, %rax 2362 ; SSE-NEXT: xorps %xmm2, %xmm2 2363 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2364 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2365 ; SSE-NEXT: movd %xmm1, %rax 2366 ; SSE-NEXT: xorps %xmm1, %xmm1 2367 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2368 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2369 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2370 ; SSE-NEXT: retq 2371 ; 2372 ; AVX1-LABEL: sitofp_load_4i64_to_4f32: 2373 ; AVX1: # BB#0: 2374 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2375 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2376 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2377 ; AVX1-NEXT: vmovq %xmm0, %rax 2378 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2379 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2380 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2381 ; AVX1-NEXT: vmovq %xmm0, %rax 2382 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2383 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2384 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2385 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2386 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2387 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2388 ; AVX1-NEXT: vzeroupper 2389 ; AVX1-NEXT: retq 2390 ; 2391 ; AVX2-LABEL: sitofp_load_4i64_to_4f32: 2392 ; AVX2: # BB#0: 2393 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2394 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2395 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2396 ; AVX2-NEXT: vmovq %xmm0, %rax 2397 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2398 ; AVX2-NEXT: vinsertps {{.*#+}} 
xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2399 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2400 ; AVX2-NEXT: vmovq %xmm0, %rax 2401 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2402 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2403 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2404 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2405 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2406 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2407 ; AVX2-NEXT: vzeroupper 2408 ; AVX2-NEXT: retq 2409 %ld = load <4 x i64>, <4 x i64> *%a 2410 %cvt = sitofp <4 x i64> %ld to <4 x float> 2411 ret <4 x float> %cvt 2412 } 2413 2414 define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { 2415 ; SSE-LABEL: sitofp_load_4i32_to_4f32: 2416 ; SSE: # BB#0: 2417 ; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 2418 ; SSE-NEXT: retq 2419 ; 2420 ; AVX-LABEL: sitofp_load_4i32_to_4f32: 2421 ; AVX: # BB#0: 2422 ; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0 2423 ; AVX-NEXT: retq 2424 %ld = load <4 x i32>, <4 x i32> *%a 2425 %cvt = sitofp <4 x i32> %ld to <4 x float> 2426 ret <4 x float> %cvt 2427 } 2428 2429 define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { 2430 ; SSE-LABEL: sitofp_load_4i16_to_4f32: 2431 ; SSE: # BB#0: 2432 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2433 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2434 ; SSE-NEXT: psrad $16, %xmm0 2435 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2436 ; SSE-NEXT: retq 2437 ; 2438 ; AVX-LABEL: sitofp_load_4i16_to_4f32: 2439 ; AVX: # BB#0: 2440 ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 2441 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2442 ; AVX-NEXT: retq 2443 %ld = load <4 x i16>, <4 x i16> *%a 2444 %cvt = sitofp <4 x i16> %ld to <4 x float> 2445 ret <4 x float> %cvt 2446 } 2447 2448 define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { 2449 ; SSE-LABEL: sitofp_load_4i8_to_4f32: 2450 ; SSE: # BB#0: 2451 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2452 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2453 ; SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2454 ; SSE-NEXT: psrad $24, %xmm0 2455 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2456 ; SSE-NEXT: retq 2457 ; 2458 ; AVX-LABEL: sitofp_load_4i8_to_4f32: 2459 ; AVX: # BB#0: 2460 ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 2461 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2462 ; AVX-NEXT: retq 2463 %ld = load <4 x i8>, <4 x i8> *%a 2464 %cvt = sitofp <4 x i8> %ld to <4 x float> 2465 ret <4 x float> %cvt 2466 } 2467 2468 define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { 2469 ; SSE-LABEL: sitofp_load_8i64_to_8f32: 2470 ; SSE: # BB#0: 2471 ; SSE-NEXT: movdqa (%rdi), %xmm1 2472 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 2473 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 2474 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 2475 ; SSE-NEXT: movd %xmm2, %rax 2476 ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 2477 ; SSE-NEXT: movd %xmm1, %rax 2478 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2479 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 2480 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2481 ; SSE-NEXT: movd %xmm2, %rax 2482 ; SSE-NEXT: xorps %xmm2, %xmm2 2483 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2484 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2485 ; SSE-NEXT: movd %xmm1, %rax 2486 ; SSE-NEXT: xorps %xmm1, %xmm1 2487 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2488 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2489 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2490 ; SSE-NEXT: movd %xmm4, %rax 2491 ; SSE-NEXT: xorps %xmm2, %xmm2 2492 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2493 ; SSE-NEXT: movd %xmm3, %rax 2494 ; SSE-NEXT: xorps %xmm1, %xmm1 2495 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2496 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2497 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] 2498 ; SSE-NEXT: movd %xmm2, %rax 2499 ; SSE-NEXT: xorps %xmm2, %xmm2 2500 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2501 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2502 ; SSE-NEXT: movd %xmm3, %rax 2503 ; SSE-NEXT: xorps %xmm3, 
%xmm3 2504 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2505 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2506 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2507 ; SSE-NEXT: retq 2508 ; 2509 ; AVX1-LABEL: sitofp_load_8i64_to_8f32: 2510 ; AVX1: # BB#0: 2511 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2512 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 2513 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax 2514 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2515 ; AVX1-NEXT: vmovq %xmm1, %rax 2516 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2517 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2518 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2519 ; AVX1-NEXT: vmovq %xmm1, %rax 2520 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2521 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2522 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax 2523 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2524 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2525 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2526 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2527 ; AVX1-NEXT: vmovq %xmm0, %rax 2528 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2529 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2530 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2531 ; AVX1-NEXT: vmovq %xmm0, %rax 2532 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2533 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2534 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2535 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2536 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2537 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 2538 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2539 ; AVX1-NEXT: retq 2540 ; 2541 ; AVX2-LABEL: sitofp_load_8i64_to_8f32: 2542 ; AVX2: # BB#0: 2543 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2544 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2545 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax 2546 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2547 ; AVX2-NEXT: vmovq %xmm1, %rax 2548 ; AVX2-NEXT: 
vcvtsi2ssq %rax, %xmm0, %xmm3 2549 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2550 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 2551 ; AVX2-NEXT: vmovq %xmm1, %rax 2552 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2553 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2554 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax 2555 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2556 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2557 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2558 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2559 ; AVX2-NEXT: vmovq %xmm0, %rax 2560 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2561 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2562 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2563 ; AVX2-NEXT: vmovq %xmm0, %rax 2564 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2565 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2566 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2567 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2568 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2569 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 2570 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2571 ; AVX2-NEXT: retq 2572 %ld = load <8 x i64>, <8 x i64> *%a 2573 %cvt = sitofp <8 x i64> %ld to <8 x float> 2574 ret <8 x float> %cvt 2575 } 2576 2577 define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { 2578 ; SSE-LABEL: sitofp_load_8i32_to_8f32: 2579 ; SSE: # BB#0: 2580 ; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 2581 ; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1 2582 ; SSE-NEXT: retq 2583 ; 2584 ; AVX-LABEL: sitofp_load_8i32_to_8f32: 2585 ; AVX: # BB#0: 2586 ; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0 2587 ; AVX-NEXT: retq 2588 %ld = load <8 x i32>, <8 x i32> *%a 2589 %cvt = sitofp <8 x i32> %ld to <8 x float> 2590 ret <8 x float> %cvt 2591 } 2592 2593 define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { 2594 ; SSE-LABEL: sitofp_load_8i16_to_8f32: 2595 ; SSE: # BB#0: 2596 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2597 ; SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2598 ; SSE-NEXT: psrad $16, %xmm0 2599 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2600 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2601 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 2602 ; SSE-NEXT: psrad $16, %xmm1 2603 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 2604 ; SSE-NEXT: retq 2605 ; 2606 ; AVX1-LABEL: sitofp_load_8i16_to_8f32: 2607 ; AVX1: # BB#0: 2608 ; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 2609 ; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 2610 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2611 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 2612 ; AVX1-NEXT: retq 2613 ; 2614 ; AVX2-LABEL: sitofp_load_8i16_to_8f32: 2615 ; AVX2: # BB#0: 2616 ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 2617 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 2618 ; AVX2-NEXT: retq 2619 %ld = load <8 x i16>, <8 x i16> *%a 2620 %cvt = sitofp <8 x i16> %ld to <8 x float> 2621 ret <8 x float> %cvt 2622 } 2623 2624 define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { 2625 ; SSE-LABEL: sitofp_load_8i8_to_8f32: 2626 ; SSE: # BB#0: 2627 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2628 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2629 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2630 ; SSE-NEXT: psrad $24, %xmm0 2631 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2632 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2633 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2634 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 2635 ; SSE-NEXT: psrad $24, %xmm1 2636 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 2637 ; SSE-NEXT: retq 2638 ; 2639 ; AVX1-LABEL: sitofp_load_8i8_to_8f32: 2640 ; AVX1: # BB#0: 2641 ; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 2642 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2643 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2644 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2645 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2646 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 2647 ; AVX1-NEXT: retq 2648 ; 
2649 ; AVX2-LABEL: sitofp_load_8i8_to_8f32: 2650 ; AVX2: # BB#0: 2651 ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 2652 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 2653 ; AVX2-NEXT: retq 2654 %ld = load <8 x i8>, <8 x i8> *%a 2655 %cvt = sitofp <8 x i8> %ld to <8 x float> 2656 ret <8 x float> %cvt 2657 } 2658 2659 ; 2660 ; Load Unsigned Integer to Float 2661 ; 2662 2663 define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { 2664 ; SSE-LABEL: uitofp_load_4i64_to_4f32: 2665 ; SSE: # BB#0: 2666 ; SSE-NEXT: movdqa (%rdi), %xmm1 2667 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 2668 ; SSE-NEXT: movd %xmm3, %rax 2669 ; SSE-NEXT: movl %eax, %ecx 2670 ; SSE-NEXT: andl $1, %ecx 2671 ; SSE-NEXT: testq %rax, %rax 2672 ; SSE-NEXT: js .LBB74_1 2673 ; SSE-NEXT: # BB#2: 2674 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2675 ; SSE-NEXT: jmp .LBB74_3 2676 ; SSE-NEXT: .LBB74_1: 2677 ; SSE-NEXT: shrq %rax 2678 ; SSE-NEXT: orq %rax, %rcx 2679 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 2680 ; SSE-NEXT: addss %xmm2, %xmm2 2681 ; SSE-NEXT: .LBB74_3: 2682 ; SSE-NEXT: movd %xmm1, %rax 2683 ; SSE-NEXT: movl %eax, %ecx 2684 ; SSE-NEXT: andl $1, %ecx 2685 ; SSE-NEXT: testq %rax, %rax 2686 ; SSE-NEXT: js .LBB74_4 2687 ; SSE-NEXT: # BB#5: 2688 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2689 ; SSE-NEXT: jmp .LBB74_6 2690 ; SSE-NEXT: .LBB74_4: 2691 ; SSE-NEXT: shrq %rax 2692 ; SSE-NEXT: orq %rax, %rcx 2693 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 2694 ; SSE-NEXT: addss %xmm0, %xmm0 2695 ; SSE-NEXT: .LBB74_6: 2696 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2697 ; SSE-NEXT: movd %xmm3, %rax 2698 ; SSE-NEXT: movl %eax, %ecx 2699 ; SSE-NEXT: andl $1, %ecx 2700 ; SSE-NEXT: testq %rax, %rax 2701 ; SSE-NEXT: js .LBB74_7 2702 ; SSE-NEXT: # BB#8: 2703 ; SSE-NEXT: xorps %xmm3, %xmm3 2704 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2705 ; SSE-NEXT: jmp .LBB74_9 2706 ; SSE-NEXT: .LBB74_7: 2707 ; SSE-NEXT: shrq %rax 2708 ; SSE-NEXT: orq %rax, %rcx 2709 ; SSE-NEXT: xorps %xmm3, %xmm3 2710 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 2711 ; SSE-NEXT: addss %xmm3, %xmm3 2712 ; 
SSE-NEXT: .LBB74_9: 2713 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2714 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2715 ; SSE-NEXT: movd %xmm1, %rax 2716 ; SSE-NEXT: movl %eax, %ecx 2717 ; SSE-NEXT: andl $1, %ecx 2718 ; SSE-NEXT: testq %rax, %rax 2719 ; SSE-NEXT: js .LBB74_10 2720 ; SSE-NEXT: # BB#11: 2721 ; SSE-NEXT: xorps %xmm1, %xmm1 2722 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2723 ; SSE-NEXT: jmp .LBB74_12 2724 ; SSE-NEXT: .LBB74_10: 2725 ; SSE-NEXT: shrq %rax 2726 ; SSE-NEXT: orq %rax, %rcx 2727 ; SSE-NEXT: xorps %xmm1, %xmm1 2728 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 2729 ; SSE-NEXT: addss %xmm1, %xmm1 2730 ; SSE-NEXT: .LBB74_12: 2731 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2732 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2733 ; SSE-NEXT: retq 2734 ; 2735 ; AVX1-LABEL: uitofp_load_4i64_to_4f32: 2736 ; AVX1: # BB#0: 2737 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2738 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2739 ; AVX1-NEXT: movl %eax, %ecx 2740 ; AVX1-NEXT: andl $1, %ecx 2741 ; AVX1-NEXT: testq %rax, %rax 2742 ; AVX1-NEXT: js .LBB74_1 2743 ; AVX1-NEXT: # BB#2: 2744 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2745 ; AVX1-NEXT: jmp .LBB74_3 2746 ; AVX1-NEXT: .LBB74_1: 2747 ; AVX1-NEXT: shrq %rax 2748 ; AVX1-NEXT: orq %rax, %rcx 2749 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 2750 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 2751 ; AVX1-NEXT: .LBB74_3: 2752 ; AVX1-NEXT: vmovq %xmm0, %rax 2753 ; AVX1-NEXT: movl %eax, %ecx 2754 ; AVX1-NEXT: andl $1, %ecx 2755 ; AVX1-NEXT: testq %rax, %rax 2756 ; AVX1-NEXT: js .LBB74_4 2757 ; AVX1-NEXT: # BB#5: 2758 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2759 ; AVX1-NEXT: jmp .LBB74_6 2760 ; AVX1-NEXT: .LBB74_4: 2761 ; AVX1-NEXT: shrq %rax 2762 ; AVX1-NEXT: orq %rax, %rcx 2763 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2764 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 2765 ; AVX1-NEXT: .LBB74_6: 2766 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2767 ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2768 ; AVX1-NEXT: vmovq %xmm0, %rax 2769 ; AVX1-NEXT: movl %eax, %ecx 2770 ; AVX1-NEXT: andl $1, %ecx 2771 ; AVX1-NEXT: testq %rax, %rax 2772 ; AVX1-NEXT: js .LBB74_7 2773 ; AVX1-NEXT: # BB#8: 2774 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2775 ; AVX1-NEXT: jmp .LBB74_9 2776 ; AVX1-NEXT: .LBB74_7: 2777 ; AVX1-NEXT: shrq %rax 2778 ; AVX1-NEXT: orq %rax, %rcx 2779 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2780 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 2781 ; AVX1-NEXT: .LBB74_9: 2782 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2783 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2784 ; AVX1-NEXT: movl %eax, %ecx 2785 ; AVX1-NEXT: andl $1, %ecx 2786 ; AVX1-NEXT: testq %rax, %rax 2787 ; AVX1-NEXT: js .LBB74_10 2788 ; AVX1-NEXT: # BB#11: 2789 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2790 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2791 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2792 ; AVX1-NEXT: vzeroupper 2793 ; AVX1-NEXT: retq 2794 ; AVX1-NEXT: .LBB74_10: 2795 ; AVX1-NEXT: shrq %rax 2796 ; AVX1-NEXT: orq %rax, %rcx 2797 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2798 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 2799 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 2800 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2801 ; AVX1-NEXT: vzeroupper 2802 ; AVX1-NEXT: retq 2803 ; 2804 ; AVX2-LABEL: uitofp_load_4i64_to_4f32: 2805 ; AVX2: # BB#0: 2806 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2807 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2808 ; AVX2-NEXT: movl %eax, %ecx 2809 ; AVX2-NEXT: andl $1, %ecx 2810 ; AVX2-NEXT: testq %rax, %rax 2811 ; AVX2-NEXT: js .LBB74_1 2812 ; AVX2-NEXT: # BB#2: 2813 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2814 ; AVX2-NEXT: jmp .LBB74_3 2815 ; AVX2-NEXT: .LBB74_1: 2816 ; AVX2-NEXT: shrq %rax 2817 ; AVX2-NEXT: orq %rax, %rcx 2818 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 2819 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 2820 ; AVX2-NEXT: .LBB74_3: 2821 ; AVX2-NEXT: vmovq %xmm0, %rax 2822 ; 
AVX2-NEXT: movl %eax, %ecx 2823 ; AVX2-NEXT: andl $1, %ecx 2824 ; AVX2-NEXT: testq %rax, %rax 2825 ; AVX2-NEXT: js .LBB74_4 2826 ; AVX2-NEXT: # BB#5: 2827 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2828 ; AVX2-NEXT: jmp .LBB74_6 2829 ; AVX2-NEXT: .LBB74_4: 2830 ; AVX2-NEXT: shrq %rax 2831 ; AVX2-NEXT: orq %rax, %rcx 2832 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2833 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 2834 ; AVX2-NEXT: .LBB74_6: 2835 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2836 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2837 ; AVX2-NEXT: vmovq %xmm0, %rax 2838 ; AVX2-NEXT: movl %eax, %ecx 2839 ; AVX2-NEXT: andl $1, %ecx 2840 ; AVX2-NEXT: testq %rax, %rax 2841 ; AVX2-NEXT: js .LBB74_7 2842 ; AVX2-NEXT: # BB#8: 2843 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2844 ; AVX2-NEXT: jmp .LBB74_9 2845 ; AVX2-NEXT: .LBB74_7: 2846 ; AVX2-NEXT: shrq %rax 2847 ; AVX2-NEXT: orq %rax, %rcx 2848 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2849 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 2850 ; AVX2-NEXT: .LBB74_9: 2851 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2852 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2853 ; AVX2-NEXT: movl %eax, %ecx 2854 ; AVX2-NEXT: andl $1, %ecx 2855 ; AVX2-NEXT: testq %rax, %rax 2856 ; AVX2-NEXT: js .LBB74_10 2857 ; AVX2-NEXT: # BB#11: 2858 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2859 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2860 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2861 ; AVX2-NEXT: vzeroupper 2862 ; AVX2-NEXT: retq 2863 ; AVX2-NEXT: .LBB74_10: 2864 ; AVX2-NEXT: shrq %rax 2865 ; AVX2-NEXT: orq %rax, %rcx 2866 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2867 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 2868 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 2869 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2870 ; AVX2-NEXT: vzeroupper 2871 ; AVX2-NEXT: retq 2872 %ld = load <4 x i64>, <4 x i64> *%a 2873 %cvt = uitofp <4 x i64> %ld to <4 x float> 2874 ret <4 x float> %cvt 2875 } 2876 2877 
; Unsigned i32 -> f32 has no direct SSE/AVX instruction pre-AVX512: the vector
; is split into low/high 16-bit halves, each OR'd with a float exponent bias so
; the integer bits become the mantissa, then recombined with adds.
define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %ld = load <4 x i32>, <4 x i32> *%a
  %cvt = uitofp <4 x i32> %ld to <4 x float>
  ret <4 x float> %cvt
}

; u16 values always fit in an i32, so zero-extension (punpcklwd with zero /
; vpmovzxwd) followed by the signed cvtdq2ps is exact.
define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x i16>, <4 x i16> *%a
  %cvt = uitofp <4 x i16> %ld to <4 x float>
  ret <4 x float> %cvt
}

; Same pattern for u8: zero-extend to i32 lanes, then signed conversion.
define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x i8>, <4 x i8> *%a
  %cvt = uitofp <4 x i8> %ld to <4 x float>
  ret <4 x float> %cvt
}

; u64 -> f32 is done per element: values with the sign bit clear use
; cvtsi2ssq directly; otherwise halve (shr, keeping the dropped bit for
; rounding via the or) then convert and double with addss.
; NOTE(review): this function's CHECK body continues below.
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: uitofp_load_8i64_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa 16(%rdi), %xmm5
; SSE-NEXT:    movdqa 32(%rdi), %xmm2
; SSE-NEXT:    movdqa 48(%rdi), %xmm3
; SSE-NEXT:    movd %xmm5, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB78_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm4
; SSE-NEXT:    jmp .LBB78_3
; SSE-NEXT:  .LBB78_1:
2972 ; SSE-NEXT: shrq %rax 2973 ; SSE-NEXT: orq %rax, %rcx 2974 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm4 2975 ; SSE-NEXT: addss %xmm4, %xmm4 2976 ; SSE-NEXT: .LBB78_3: 2977 ; SSE-NEXT: movd %xmm1, %rax 2978 ; SSE-NEXT: movl %eax, %ecx 2979 ; SSE-NEXT: andl $1, %ecx 2980 ; SSE-NEXT: testq %rax, %rax 2981 ; SSE-NEXT: js .LBB78_4 2982 ; SSE-NEXT: # BB#5: 2983 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2984 ; SSE-NEXT: jmp .LBB78_6 2985 ; SSE-NEXT: .LBB78_4: 2986 ; SSE-NEXT: shrq %rax 2987 ; SSE-NEXT: orq %rax, %rcx 2988 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 2989 ; SSE-NEXT: addss %xmm0, %xmm0 2990 ; SSE-NEXT: .LBB78_6: 2991 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] 2992 ; SSE-NEXT: movd %xmm5, %rax 2993 ; SSE-NEXT: movl %eax, %ecx 2994 ; SSE-NEXT: andl $1, %ecx 2995 ; SSE-NEXT: testq %rax, %rax 2996 ; SSE-NEXT: js .LBB78_7 2997 ; SSE-NEXT: # BB#8: 2998 ; SSE-NEXT: cvtsi2ssq %rax, %xmm6 2999 ; SSE-NEXT: jmp .LBB78_9 3000 ; SSE-NEXT: .LBB78_7: 3001 ; SSE-NEXT: shrq %rax 3002 ; SSE-NEXT: orq %rax, %rcx 3003 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm6 3004 ; SSE-NEXT: addss %xmm6, %xmm6 3005 ; SSE-NEXT: .LBB78_9: 3006 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 3007 ; SSE-NEXT: movd %xmm1, %rax 3008 ; SSE-NEXT: movl %eax, %ecx 3009 ; SSE-NEXT: andl $1, %ecx 3010 ; SSE-NEXT: testq %rax, %rax 3011 ; SSE-NEXT: js .LBB78_10 3012 ; SSE-NEXT: # BB#11: 3013 ; SSE-NEXT: xorps %xmm5, %xmm5 3014 ; SSE-NEXT: cvtsi2ssq %rax, %xmm5 3015 ; SSE-NEXT: jmp .LBB78_12 3016 ; SSE-NEXT: .LBB78_10: 3017 ; SSE-NEXT: shrq %rax 3018 ; SSE-NEXT: orq %rax, %rcx 3019 ; SSE-NEXT: xorps %xmm5, %xmm5 3020 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm5 3021 ; SSE-NEXT: addss %xmm5, %xmm5 3022 ; SSE-NEXT: .LBB78_12: 3023 ; SSE-NEXT: movd %xmm3, %rax 3024 ; SSE-NEXT: movl %eax, %ecx 3025 ; SSE-NEXT: andl $1, %ecx 3026 ; SSE-NEXT: testq %rax, %rax 3027 ; SSE-NEXT: js .LBB78_13 3028 ; SSE-NEXT: # BB#14: 3029 ; SSE-NEXT: cvtsi2ssq %rax, %xmm7 3030 ; SSE-NEXT: jmp .LBB78_15 3031 ; SSE-NEXT: .LBB78_13: 3032 ; SSE-NEXT: shrq %rax 3033 ; 
SSE-NEXT: orq %rax, %rcx 3034 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm7 3035 ; SSE-NEXT: addss %xmm7, %xmm7 3036 ; SSE-NEXT: .LBB78_15: 3037 ; SSE-NEXT: movd %xmm2, %rax 3038 ; SSE-NEXT: movl %eax, %ecx 3039 ; SSE-NEXT: andl $1, %ecx 3040 ; SSE-NEXT: testq %rax, %rax 3041 ; SSE-NEXT: js .LBB78_16 3042 ; SSE-NEXT: # BB#17: 3043 ; SSE-NEXT: xorps %xmm1, %xmm1 3044 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 3045 ; SSE-NEXT: jmp .LBB78_18 3046 ; SSE-NEXT: .LBB78_16: 3047 ; SSE-NEXT: shrq %rax 3048 ; SSE-NEXT: orq %rax, %rcx 3049 ; SSE-NEXT: xorps %xmm1, %xmm1 3050 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 3051 ; SSE-NEXT: addss %xmm1, %xmm1 3052 ; SSE-NEXT: .LBB78_18: 3053 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 3054 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 3055 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 3056 ; SSE-NEXT: movd %xmm3, %rax 3057 ; SSE-NEXT: movl %eax, %ecx 3058 ; SSE-NEXT: andl $1, %ecx 3059 ; SSE-NEXT: testq %rax, %rax 3060 ; SSE-NEXT: js .LBB78_19 3061 ; SSE-NEXT: # BB#20: 3062 ; SSE-NEXT: xorps %xmm3, %xmm3 3063 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 3064 ; SSE-NEXT: jmp .LBB78_21 3065 ; SSE-NEXT: .LBB78_19: 3066 ; SSE-NEXT: shrq %rax 3067 ; SSE-NEXT: orq %rax, %rcx 3068 ; SSE-NEXT: xorps %xmm3, %xmm3 3069 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 3070 ; SSE-NEXT: addss %xmm3, %xmm3 3071 ; SSE-NEXT: .LBB78_21: 3072 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 3073 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] 3074 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 3075 ; SSE-NEXT: movd %xmm2, %rax 3076 ; SSE-NEXT: movl %eax, %ecx 3077 ; SSE-NEXT: andl $1, %ecx 3078 ; SSE-NEXT: testq %rax, %rax 3079 ; SSE-NEXT: js .LBB78_22 3080 ; SSE-NEXT: # BB#23: 3081 ; SSE-NEXT: xorps %xmm2, %xmm2 3082 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 3083 ; SSE-NEXT: jmp .LBB78_24 3084 ; SSE-NEXT: .LBB78_22: 3085 ; SSE-NEXT: shrq %rax 3086 ; SSE-NEXT: orq %rax, %rcx 3087 ; SSE-NEXT: xorps %xmm2, %xmm2 3088 ; 
SSE-NEXT: cvtsi2ssq %rcx, %xmm2 3089 ; SSE-NEXT: addss %xmm2, %xmm2 3090 ; SSE-NEXT: .LBB78_24: 3091 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 3092 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3093 ; SSE-NEXT: retq 3094 ; 3095 ; AVX1-LABEL: uitofp_load_8i64_to_8f32: 3096 ; AVX1: # BB#0: 3097 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 3098 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2 3099 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax 3100 ; AVX1-NEXT: movl %eax, %ecx 3101 ; AVX1-NEXT: andl $1, %ecx 3102 ; AVX1-NEXT: testq %rax, %rax 3103 ; AVX1-NEXT: js .LBB78_1 3104 ; AVX1-NEXT: # BB#2: 3105 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 3106 ; AVX1-NEXT: jmp .LBB78_3 3107 ; AVX1-NEXT: .LBB78_1: 3108 ; AVX1-NEXT: shrq %rax 3109 ; AVX1-NEXT: orq %rax, %rcx 3110 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 3111 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 3112 ; AVX1-NEXT: .LBB78_3: 3113 ; AVX1-NEXT: vmovq %xmm2, %rax 3114 ; AVX1-NEXT: movl %eax, %ecx 3115 ; AVX1-NEXT: andl $1, %ecx 3116 ; AVX1-NEXT: testq %rax, %rax 3117 ; AVX1-NEXT: js .LBB78_4 3118 ; AVX1-NEXT: # BB#5: 3119 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 3120 ; AVX1-NEXT: jmp .LBB78_6 3121 ; AVX1-NEXT: .LBB78_4: 3122 ; AVX1-NEXT: shrq %rax 3123 ; AVX1-NEXT: orq %rax, %rcx 3124 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 3125 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 3126 ; AVX1-NEXT: .LBB78_6: 3127 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 3128 ; AVX1-NEXT: vmovq %xmm2, %rax 3129 ; AVX1-NEXT: movl %eax, %ecx 3130 ; AVX1-NEXT: andl $1, %ecx 3131 ; AVX1-NEXT: testq %rax, %rax 3132 ; AVX1-NEXT: js .LBB78_7 3133 ; AVX1-NEXT: # BB#8: 3134 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 3135 ; AVX1-NEXT: jmp .LBB78_9 3136 ; AVX1-NEXT: .LBB78_7: 3137 ; AVX1-NEXT: shrq %rax 3138 ; AVX1-NEXT: orq %rax, %rcx 3139 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 3140 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 3141 ; AVX1-NEXT: .LBB78_9: 3142 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax 3143 ; AVX1-NEXT: movl %eax, %ecx 
3144 ; AVX1-NEXT: andl $1, %ecx 3145 ; AVX1-NEXT: testq %rax, %rax 3146 ; AVX1-NEXT: js .LBB78_10 3147 ; AVX1-NEXT: # BB#11: 3148 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 3149 ; AVX1-NEXT: jmp .LBB78_12 3150 ; AVX1-NEXT: .LBB78_10: 3151 ; AVX1-NEXT: shrq %rax 3152 ; AVX1-NEXT: orq %rax, %rcx 3153 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 3154 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 3155 ; AVX1-NEXT: .LBB78_12: 3156 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 3157 ; AVX1-NEXT: movl %eax, %ecx 3158 ; AVX1-NEXT: andl $1, %ecx 3159 ; AVX1-NEXT: testq %rax, %rax 3160 ; AVX1-NEXT: js .LBB78_13 3161 ; AVX1-NEXT: # BB#14: 3162 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 3163 ; AVX1-NEXT: jmp .LBB78_15 3164 ; AVX1-NEXT: .LBB78_13: 3165 ; AVX1-NEXT: shrq %rax 3166 ; AVX1-NEXT: orq %rax, %rcx 3167 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 3168 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 3169 ; AVX1-NEXT: .LBB78_15: 3170 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] 3171 ; AVX1-NEXT: vmovq %xmm0, %rax 3172 ; AVX1-NEXT: movl %eax, %ecx 3173 ; AVX1-NEXT: andl $1, %ecx 3174 ; AVX1-NEXT: testq %rax, %rax 3175 ; AVX1-NEXT: js .LBB78_16 3176 ; AVX1-NEXT: # BB#17: 3177 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 3178 ; AVX1-NEXT: jmp .LBB78_18 3179 ; AVX1-NEXT: .LBB78_16: 3180 ; AVX1-NEXT: shrq %rax 3181 ; AVX1-NEXT: orq %rax, %rcx 3182 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 3183 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 3184 ; AVX1-NEXT: .LBB78_18: 3185 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] 3186 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] 3187 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 3188 ; AVX1-NEXT: vmovq %xmm4, %rax 3189 ; AVX1-NEXT: movl %eax, %ecx 3190 ; AVX1-NEXT: andl $1, %ecx 3191 ; AVX1-NEXT: testq %rax, %rax 3192 ; AVX1-NEXT: js .LBB78_19 3193 ; AVX1-NEXT: # BB#20: 3194 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 3195 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 3196 ; AVX1-NEXT: jmp .LBB78_21 3197 ; AVX1-NEXT: 
.LBB78_19: 3198 ; AVX1-NEXT: shrq %rax 3199 ; AVX1-NEXT: orq %rax, %rcx 3200 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 3201 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 3202 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 3203 ; AVX1-NEXT: .LBB78_21: 3204 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] 3205 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] 3206 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax 3207 ; AVX1-NEXT: movl %eax, %ecx 3208 ; AVX1-NEXT: andl $1, %ecx 3209 ; AVX1-NEXT: testq %rax, %rax 3210 ; AVX1-NEXT: js .LBB78_22 3211 ; AVX1-NEXT: # BB#23: 3212 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 3213 ; AVX1-NEXT: jmp .LBB78_24 3214 ; AVX1-NEXT: .LBB78_22: 3215 ; AVX1-NEXT: shrq %rax 3216 ; AVX1-NEXT: orq %rax, %rcx 3217 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 3218 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 3219 ; AVX1-NEXT: .LBB78_24: 3220 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] 3221 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 3222 ; AVX1-NEXT: retq 3223 ; 3224 ; AVX2-LABEL: uitofp_load_8i64_to_8f32: 3225 ; AVX2: # BB#0: 3226 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3227 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 3228 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax 3229 ; AVX2-NEXT: movl %eax, %ecx 3230 ; AVX2-NEXT: andl $1, %ecx 3231 ; AVX2-NEXT: testq %rax, %rax 3232 ; AVX2-NEXT: js .LBB78_1 3233 ; AVX2-NEXT: # BB#2: 3234 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 3235 ; AVX2-NEXT: jmp .LBB78_3 3236 ; AVX2-NEXT: .LBB78_1: 3237 ; AVX2-NEXT: shrq %rax 3238 ; AVX2-NEXT: orq %rax, %rcx 3239 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 3240 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 3241 ; AVX2-NEXT: .LBB78_3: 3242 ; AVX2-NEXT: vmovq %xmm2, %rax 3243 ; AVX2-NEXT: movl %eax, %ecx 3244 ; AVX2-NEXT: andl $1, %ecx 3245 ; AVX2-NEXT: testq %rax, %rax 3246 ; AVX2-NEXT: js .LBB78_4 3247 ; AVX2-NEXT: # BB#5: 3248 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 3249 ; AVX2-NEXT: jmp .LBB78_6 3250 ; AVX2-NEXT: .LBB78_4: 3251 ; AVX2-NEXT: shrq %rax 3252 ; AVX2-NEXT: orq 
%rax, %rcx 3253 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 3254 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 3255 ; AVX2-NEXT: .LBB78_6: 3256 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 3257 ; AVX2-NEXT: vmovq %xmm2, %rax 3258 ; AVX2-NEXT: movl %eax, %ecx 3259 ; AVX2-NEXT: andl $1, %ecx 3260 ; AVX2-NEXT: testq %rax, %rax 3261 ; AVX2-NEXT: js .LBB78_7 3262 ; AVX2-NEXT: # BB#8: 3263 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 3264 ; AVX2-NEXT: jmp .LBB78_9 3265 ; AVX2-NEXT: .LBB78_7: 3266 ; AVX2-NEXT: shrq %rax 3267 ; AVX2-NEXT: orq %rax, %rcx 3268 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 3269 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 3270 ; AVX2-NEXT: .LBB78_9: 3271 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax 3272 ; AVX2-NEXT: movl %eax, %ecx 3273 ; AVX2-NEXT: andl $1, %ecx 3274 ; AVX2-NEXT: testq %rax, %rax 3275 ; AVX2-NEXT: js .LBB78_10 3276 ; AVX2-NEXT: # BB#11: 3277 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 3278 ; AVX2-NEXT: jmp .LBB78_12 3279 ; AVX2-NEXT: .LBB78_10: 3280 ; AVX2-NEXT: shrq %rax 3281 ; AVX2-NEXT: orq %rax, %rcx 3282 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 3283 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 3284 ; AVX2-NEXT: .LBB78_12: 3285 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 3286 ; AVX2-NEXT: movl %eax, %ecx 3287 ; AVX2-NEXT: andl $1, %ecx 3288 ; AVX2-NEXT: testq %rax, %rax 3289 ; AVX2-NEXT: js .LBB78_13 3290 ; AVX2-NEXT: # BB#14: 3291 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 3292 ; AVX2-NEXT: jmp .LBB78_15 3293 ; AVX2-NEXT: .LBB78_13: 3294 ; AVX2-NEXT: shrq %rax 3295 ; AVX2-NEXT: orq %rax, %rcx 3296 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 3297 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 3298 ; AVX2-NEXT: .LBB78_15: 3299 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] 3300 ; AVX2-NEXT: vmovq %xmm0, %rax 3301 ; AVX2-NEXT: movl %eax, %ecx 3302 ; AVX2-NEXT: andl $1, %ecx 3303 ; AVX2-NEXT: testq %rax, %rax 3304 ; AVX2-NEXT: js .LBB78_16 3305 ; AVX2-NEXT: # BB#17: 3306 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 3307 ; AVX2-NEXT: jmp .LBB78_18 3308 ; 
; AVX2-NEXT:  .LBB78_16:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT:  .LBB78_18:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vmovq %xmm4, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_19
; AVX2-NEXT:  # BB#20:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT:    jmp .LBB78_21
; AVX2-NEXT:  .LBB78_19:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT:  .LBB78_21:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    testq %rax, %rax
; AVX2-NEXT:    js .LBB78_22
; AVX2-NEXT:  # BB#23:
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    jmp .LBB78_24
; AVX2-NEXT:  .LBB78_22:
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    orq %rax, %rcx
; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT:  .LBB78_24:
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i64>, <8 x i64> *%a
  %cvt = uitofp <8 x i64> %ld to <8 x float>
  ret <8 x float> %cvt
}

; NOTE(review): the CHECK lines in this file look machine-generated (the
; "{{.*#+}}" asm-comment patterns are characteristic of
; utils/update_llc_test_checks.py) -- prefer regenerating them over hand-editing
; if codegen changes. TODO confirm against the script header at the top of the
; original file.

; uitofp <8 x i32> (from memory) -> <8 x float>.
; The SSE2/AVX2 lowerings checked below split each u32 lane into its low and
; high 16-bit halves (pand 0xFFFF / psrld $16), tag each half with a
; float-exponent magic constant via por/vpblendw, then use an addps against a
; compensating constant to combine the halves -- since no unsigned
; int-to-float vector instruction is available on these subtargets.
define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: uitofp_load_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pand %xmm2, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    por %xmm4, %xmm3
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT:    por %xmm5, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT:    addps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm2
; SSE-NEXT:    por %xmm4, %xmm2
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    addps %xmm6, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i32_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i32_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32> *%a
  %cvt = uitofp <8 x i32> %ld to <8 x float>
  ret <8 x float> %cvt
}

; uitofp <8 x i16> (from memory) -> <8 x float>.
; u16 fits exactly in an i32, so the checks expect a plain zero-extend
; (punpcklwd/punpckhwd with a zero register, or vpmovzxwd) followed by the
; signed cvtdq2ps -- no magic constants needed.
define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: uitofp_load_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i16>, <8 x i16> *%a
  %cvt = uitofp <8 x i16> %ld to <8 x float>
  ret <8 x float> %cvt
}

; uitofp <8 x i8> (from memory) -> <8 x float>.
; Same zero-extend-then-cvtdq2ps pattern as the i16 case, with one extra
; widening step on SSE (punpcklbw to u16 first) and a direct vpmovzxbd on AVX.
define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: uitofp_load_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_load_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_load_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i8>, <8 x i8> *%a
  %cvt = uitofp <8 x i8> %ld to <8 x float>
  ret <8 x float> %cvt
}

;
; Aggregates
;

; sitofp of a vector extracted from a packed (align-1) aggregate: exercises an
; unaligned vector load (movdqu from offset 8), sign-extension to i32, and an
; aligned store through the pointer held in the aggregate's third field
; (loaded from offset 24).
%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movq 24(%rdi), %rax
; SSE-NEXT:    movdqu 8(%rdi), %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movaps %xmm1, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq 24(%rdi), %rax
; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq 24(%rdi), %rax
; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = load %Arguments, %Arguments* %a0, align 1
  %2 = extractvalue %Arguments %1, 1
  %3 = extractvalue %Arguments %1, 2
  %4 = sitofp <8 x i16> %2 to <8 x float>
  store <8 x float> %4, <8 x float>* %3, align 32
  ret void
}