; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Signed Integer to Double
;

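; Note: without AVX512DQ there is no packed i64 -> f64 conversion instruction,
; so for <2 x i64> sources the expected lowering extracts each element and
; converts it with the scalar cvtsi2sdq before repacking the results.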
define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    # kill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

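; Note: SSE2 has no pmovsx* instructions (those arrived with SSE4.1), so the
; sign extensions below are done by unpacking the elements into the high bits
; of each lane and arithmetic-shifting them back down (punpcklbw/punpcklwd +
; psrad).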
define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Unsigned Integer to Double
;

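; Note: the lowerings below use the standard exponent-bias trick. The magic
; integers are 0x43300000 (the high word of the double 2^52) and 0x45300000
; (the high word of 2^84), and the constant-pool doubles
; [4.503600e+15,1.934281e+25] are 2^52 and 2^84 themselves. Interleaving the
; 32-bit halves of a u64 with those high words builds doubles equal to
; 2^52 + lo32(x) and 2^84 + hi32(x) * 2^32; subtracting the biases and summing
; the two lanes yields x, correctly rounded. A C-style sketch of the
; per-element math (bits_to_double is an illustrative helper, not a real
; intrinsic):
;
;   double lo = bits_to_double(0x4330000000000000 | (x & 0xffffffff)) - 0x1p52;
;   double hi = bits_to_double(0x4530000000000000 | (x >> 32)) - 0x1p84;
;   double result = hi + lo;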
define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

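; Note: on AVX, u32 -> f64 is lowered by splitting each element into its low
; and high 16-bit halves. Both halves are non-negative and fit in an i32, so
; each converts exactly via the signed vcvtdq2pd; the high half is then scaled
; by 65536.0 (the vmulpd constant) and added back to the low half.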
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

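; Note: for u16 and u8 sources zero extension is sufficient, since the
; extended values stay within the range cvtdq2pd converts exactly as a signed
; i32. SSE2 zero extends with pxor + punpckl*; AVX uses pmovzx*.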
define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm5, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Signed Integer to Float
;

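; As with f64 above, there is no packed i64 -> f32 conversion before AVX512DQ:
; each element goes through the scalar cvtsi2ssq and the scalars are
; reassembled with unpcklps (SSE) or vinsertps (AVX).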
define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;

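; Note: u64 -> f32 must handle inputs with the sign bit set. The branchy
; sequence below converts non-negative values directly with cvtsi2ssq; for
; values >= 2^63 it shifts right by one while ORing the shifted-out low bit
; back in (round-to-odd, which keeps the final result correctly rounded),
; converts the halved value, and doubles the result with addss.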
define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_1
; SSE-NEXT:  # BB#2:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB38_3
; SSE-NEXT:  .LBB38_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB38_3:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_4
; SSE-NEXT:  # BB#5:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
; SSE-NEXT:  .LBB38_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB38_3
; AVX-NEXT:  .LBB38_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB38_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB38_6
; AVX-NEXT:  .LBB38_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB38_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB38_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    js .LBB39_2
; SSE-NEXT:  # BB#1:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:  .LBB39_2:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_3
; SSE-NEXT:  # BB#4:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB39_5
; SSE-NEXT:  .LBB39_3:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:  .LBB39_5:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_6
; SSE-NEXT:  # BB#7:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB39_8
; SSE-NEXT:  .LBB39_6:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:  .LBB39_8:
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_1
; AVX-NEXT:  # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB39_3
; AVX-NEXT:  .LBB39_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:  .LBB39_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_4
; AVX-NEXT:  # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB39_6
; AVX-NEXT:  .LBB39_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:  .LBB39_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_8
; AVX-NEXT:  # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:  .LBB39_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
   1326 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
   1327 ; AVX-NEXT:    retq
   1328   %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   1329   %cvt = uitofp <4 x i64> %ext to <4 x float>
   1330   ret <4 x float> %cvt
   1331 }
   1332 
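; Note on the SSE code above: lanes 2 and 3 of %ext are undef, so the
; backend still emits a conversion for one of them but never materialises
; its input, which is why the first block tests %rax without any prior
; definition of the register.
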
   1333 define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
   1334 ; SSE-LABEL: uitofp_4i32_to_4f32:
   1335 ; SSE:       # BB#0:
   1336 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
   1337 ; SSE-NEXT:    pand %xmm0, %xmm1
   1338 ; SSE-NEXT:    por {{.*}}(%rip), %xmm1
   1339 ; SSE-NEXT:    psrld $16, %xmm0
   1340 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   1341 ; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
   1342 ; SSE-NEXT:    addps %xmm1, %xmm0
   1343 ; SSE-NEXT:    retq
   1344 ;
   1345 ; AVX1-LABEL: uitofp_4i32_to_4f32:
   1346 ; AVX1:       # BB#0:
   1347 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
   1348 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
   1349 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
   1350 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
   1351 ; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
   1352 ; AVX1-NEXT:    retq
   1353 ;
   1354 ; AVX2-LABEL: uitofp_4i32_to_4f32:
   1355 ; AVX2:       # BB#0:
   1356 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
   1357 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1358 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
   1359 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
   1360 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
   1361 ; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
   1362 ; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
   1363 ; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
   1364 ; AVX2-NEXT:    retq
   1365   %cvt = uitofp <4 x i32> %a to <4 x float>
   1366   ret <4 x float> %cvt
   1367 }
   1368 
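; The constants above implement the usual 16-bit split for unsigned i32:
; 0x4B000000 (1258291200) is the float 2^23 and 0x53000000 (1392508928)
; is 2^39, so OR-ing the low half of each lane into the first pattern
; yields 2^23+lo exactly, and OR-ing the high half into the second yields
; 2^39+hi*2^16. Adding -(2^39+2^23) (the -5.497642e+11 addps constant)
; to the high piece and then adding the still-biased low piece cancels
; both biases, leaving hi*2^16+lo with a single final rounding. A
; plain-arithmetic IR sketch of the same split (hypothetical function,
; not part of the checked output):

define float @u32_to_f32_split_sketch(i32 %x) {
  %lo    = and i32 %x, 65535
  %hi    = lshr i32 %x, 16
  %loflt = sitofp i32 %lo to float           ; both halves fit in 16 bits,
  %hiflt = sitofp i32 %hi to float           ; so these convert exactly
  %hisc  = fmul float %hiflt, 6.553600e+04   ; exact scale by 2^16
  %res   = fadd float %hisc, %loflt          ; the only rounding step
  ret float %res
}
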
   1369 define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
   1370 ; SSE-LABEL: uitofp_4i16_to_4f32:
   1371 ; SSE:       # BB#0:
   1372 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1373 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1374 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   1375 ; SSE-NEXT:    retq
   1376 ;
   1377 ; AVX-LABEL: uitofp_4i16_to_4f32:
   1378 ; AVX:       # BB#0:
   1379 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1380 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   1381 ; AVX-NEXT:    retq
   1382   %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1383   %cvt = uitofp <4 x i16> %shuf to <4 x float>
   1384   ret <4 x float> %cvt
   1385 }
   1386 
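; For i8 and i16 sources, here and in the tests that follow, unsigned
; conversion needs no trick at all: zero-extend to i32 and reuse the
; signed cvtdq2ps, since the extended values are non-negative and fit in
; a float's 24-bit significand, making the conversion exact.
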
   1387 define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
   1388 ; SSE-LABEL: uitofp_8i16_to_4f32:
   1389 ; SSE:       # BB#0:
   1390 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1391 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1392 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   1393 ; SSE-NEXT:    retq
   1394 ;
   1395 ; AVX1-LABEL: uitofp_8i16_to_4f32:
   1396 ; AVX1:       # BB#0:
   1397 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1398 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1399 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1400 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1401 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1402 ; AVX1-NEXT:    # kill
   1403 ; AVX1-NEXT:    vzeroupper
   1404 ; AVX1-NEXT:    retq
   1405 ;
   1406 ; AVX2-LABEL: uitofp_8i16_to_4f32:
   1407 ; AVX2:       # BB#0:
   1408 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1409 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1410 ; AVX2-NEXT:    # kill
   1411 ; AVX2-NEXT:    vzeroupper
   1412 ; AVX2-NEXT:    retq
   1413   %cvt = uitofp <8 x i16> %a to <8 x float>
   1414   %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1415   ret <4 x float> %shuf
   1416 }
   1417 
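; When only the low four results survive the shuffle, SSE converts just
; the low half (the same code as the 4i16 test), while AVX converts the
; whole widened vector and returns the low xmm of the ymm result; the
; "# kill" comment appears to be the stripped remnant of the
; register-allocator note marking that implicit ymm-to-xmm truncation.
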
   1418 define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
   1419 ; SSE-LABEL: uitofp_4i8_to_4f32:
   1420 ; SSE:       # BB#0:
   1421 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1422 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1423 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1424 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   1425 ; SSE-NEXT:    retq
   1426 ;
   1427 ; AVX-LABEL: uitofp_4i8_to_4f32:
   1428 ; AVX:       # BB#0:
   1429 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1430 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   1431 ; AVX-NEXT:    retq
   1432   %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1433   %cvt = uitofp <4 x i8> %shuf to <4 x float>
   1434   ret <4 x float> %cvt
   1435 }
   1436 
   1437 define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
   1438 ; SSE-LABEL: uitofp_16i8_to_4f32:
   1439 ; SSE:       # BB#0:
   1440 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1441 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1442 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1443 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   1444 ; SSE-NEXT:    retq
   1445 ;
   1446 ; AVX1-LABEL: uitofp_16i8_to_4f32:
   1447 ; AVX1:       # BB#0:
   1448 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1449 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   1450 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1451 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   1452 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1453 ; AVX1-NEXT:    # kill
   1454 ; AVX1-NEXT:    vzeroupper
   1455 ; AVX1-NEXT:    retq
   1456 ;
   1457 ; AVX2-LABEL: uitofp_16i8_to_4f32:
   1458 ; AVX2:       # BB#0:
   1459 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1460 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1461 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1462 ; AVX2-NEXT:    # kill
   1463 ; AVX2-NEXT:    vzeroupper
   1464 ; AVX2-NEXT:    retq
   1465   %cvt = uitofp <16 x i8> %a to <16 x float>
   1466   %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1467   ret <4 x float> %shuf
   1468 }
   1469 
   1470 define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
   1471 ; SSE-LABEL: uitofp_4i64_to_4f32:
   1472 ; SSE:       # BB#0:
   1473 ; SSE-NEXT:    movd %xmm1, %rax
   1474 ; SSE-NEXT:    movl %eax, %ecx
   1475 ; SSE-NEXT:    andl $1, %ecx
   1476 ; SSE-NEXT:    testq %rax, %rax
   1477 ; SSE-NEXT:    js .LBB45_1
   1478 ; SSE-NEXT:  # BB#2:
   1479 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
   1480 ; SSE-NEXT:    jmp .LBB45_3
   1481 ; SSE-NEXT:  .LBB45_1:
   1482 ; SSE-NEXT:    shrq %rax
   1483 ; SSE-NEXT:    orq %rax, %rcx
   1484 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
   1485 ; SSE-NEXT:    addss %xmm3, %xmm3
   1486 ; SSE-NEXT:  .LBB45_3:
   1487 ; SSE-NEXT:    movd %xmm0, %rax
   1488 ; SSE-NEXT:    movl %eax, %ecx
   1489 ; SSE-NEXT:    andl $1, %ecx
   1490 ; SSE-NEXT:    testq %rax, %rax
   1491 ; SSE-NEXT:    js .LBB45_4
   1492 ; SSE-NEXT:  # BB#5:
   1493 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   1494 ; SSE-NEXT:    jmp .LBB45_6
   1495 ; SSE-NEXT:  .LBB45_4:
   1496 ; SSE-NEXT:    shrq %rax
   1497 ; SSE-NEXT:    orq %rax, %rcx
   1498 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
   1499 ; SSE-NEXT:    addss %xmm2, %xmm2
   1500 ; SSE-NEXT:  .LBB45_6:
   1501 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1502 ; SSE-NEXT:    movd %xmm1, %rax
   1503 ; SSE-NEXT:    movl %eax, %ecx
   1504 ; SSE-NEXT:    andl $1, %ecx
   1505 ; SSE-NEXT:    testq %rax, %rax
   1506 ; SSE-NEXT:    js .LBB45_7
   1507 ; SSE-NEXT:  # BB#8:
   1508 ; SSE-NEXT:    xorps %xmm1, %xmm1
   1509 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   1510 ; SSE-NEXT:    jmp .LBB45_9
   1511 ; SSE-NEXT:  .LBB45_7:
   1512 ; SSE-NEXT:    shrq %rax
   1513 ; SSE-NEXT:    orq %rax, %rcx
   1514 ; SSE-NEXT:    xorps %xmm1, %xmm1
   1515 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
   1516 ; SSE-NEXT:    addss %xmm1, %xmm1
   1517 ; SSE-NEXT:  .LBB45_9:
   1518 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   1519 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1520 ; SSE-NEXT:    movd %xmm0, %rax
   1521 ; SSE-NEXT:    movl %eax, %ecx
   1522 ; SSE-NEXT:    andl $1, %ecx
   1523 ; SSE-NEXT:    testq %rax, %rax
   1524 ; SSE-NEXT:    js .LBB45_10
   1525 ; SSE-NEXT:  # BB#11:
   1526 ; SSE-NEXT:    xorps %xmm0, %xmm0
   1527 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
   1528 ; SSE-NEXT:    jmp .LBB45_12
   1529 ; SSE-NEXT:  .LBB45_10:
   1530 ; SSE-NEXT:    shrq %rax
   1531 ; SSE-NEXT:    orq %rax, %rcx
   1532 ; SSE-NEXT:    xorps %xmm0, %xmm0
   1533 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
   1534 ; SSE-NEXT:    addss %xmm0, %xmm0
   1535 ; SSE-NEXT:  .LBB45_12:
   1536 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1537 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
   1538 ; SSE-NEXT:    movaps %xmm2, %xmm0
   1539 ; SSE-NEXT:    retq
   1540 ;
   1541 ; AVX1-LABEL: uitofp_4i64_to_4f32:
   1542 ; AVX1:       # BB#0:
   1543 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   1544 ; AVX1-NEXT:    movl %eax, %ecx
   1545 ; AVX1-NEXT:    andl $1, %ecx
   1546 ; AVX1-NEXT:    testq %rax, %rax
   1547 ; AVX1-NEXT:    js .LBB45_1
   1548 ; AVX1-NEXT:  # BB#2:
   1549 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   1550 ; AVX1-NEXT:    jmp .LBB45_3
   1551 ; AVX1-NEXT:  .LBB45_1:
   1552 ; AVX1-NEXT:    shrq %rax
   1553 ; AVX1-NEXT:    orq %rax, %rcx
   1554 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   1555 ; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   1556 ; AVX1-NEXT:  .LBB45_3:
   1557 ; AVX1-NEXT:    vmovq %xmm0, %rax
   1558 ; AVX1-NEXT:    movl %eax, %ecx
   1559 ; AVX1-NEXT:    andl $1, %ecx
   1560 ; AVX1-NEXT:    testq %rax, %rax
   1561 ; AVX1-NEXT:    js .LBB45_4
   1562 ; AVX1-NEXT:  # BB#5:
   1563 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   1564 ; AVX1-NEXT:    jmp .LBB45_6
   1565 ; AVX1-NEXT:  .LBB45_4:
   1566 ; AVX1-NEXT:    shrq %rax
   1567 ; AVX1-NEXT:    orq %rax, %rcx
   1568 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   1569 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   1570 ; AVX1-NEXT:  .LBB45_6:
   1571 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   1572 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1573 ; AVX1-NEXT:    vmovq %xmm0, %rax
   1574 ; AVX1-NEXT:    movl %eax, %ecx
   1575 ; AVX1-NEXT:    andl $1, %ecx
   1576 ; AVX1-NEXT:    testq %rax, %rax
   1577 ; AVX1-NEXT:    js .LBB45_7
   1578 ; AVX1-NEXT:  # BB#8:
   1579 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   1580 ; AVX1-NEXT:    jmp .LBB45_9
   1581 ; AVX1-NEXT:  .LBB45_7:
   1582 ; AVX1-NEXT:    shrq %rax
   1583 ; AVX1-NEXT:    orq %rax, %rcx
   1584 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   1585 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   1586 ; AVX1-NEXT:  .LBB45_9:
   1587 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   1588 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   1589 ; AVX1-NEXT:    movl %eax, %ecx
   1590 ; AVX1-NEXT:    andl $1, %ecx
   1591 ; AVX1-NEXT:    testq %rax, %rax
   1592 ; AVX1-NEXT:    js .LBB45_10
   1593 ; AVX1-NEXT:  # BB#11:
   1594 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1595 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   1596 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1597 ; AVX1-NEXT:    vzeroupper
   1598 ; AVX1-NEXT:    retq
   1599 ; AVX1-NEXT:  .LBB45_10:
   1600 ; AVX1-NEXT:    shrq %rax
   1601 ; AVX1-NEXT:    orq %rax, %rcx
   1602 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1603 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   1604 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
   1605 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1606 ; AVX1-NEXT:    vzeroupper
   1607 ; AVX1-NEXT:    retq
   1608 ;
   1609 ; AVX2-LABEL: uitofp_4i64_to_4f32:
   1610 ; AVX2:       # BB#0:
   1611 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   1612 ; AVX2-NEXT:    movl %eax, %ecx
   1613 ; AVX2-NEXT:    andl $1, %ecx
   1614 ; AVX2-NEXT:    testq %rax, %rax
   1615 ; AVX2-NEXT:    js .LBB45_1
   1616 ; AVX2-NEXT:  # BB#2:
   1617 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   1618 ; AVX2-NEXT:    jmp .LBB45_3
   1619 ; AVX2-NEXT:  .LBB45_1:
   1620 ; AVX2-NEXT:    shrq %rax
   1621 ; AVX2-NEXT:    orq %rax, %rcx
   1622 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   1623 ; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   1624 ; AVX2-NEXT:  .LBB45_3:
   1625 ; AVX2-NEXT:    vmovq %xmm0, %rax
   1626 ; AVX2-NEXT:    movl %eax, %ecx
   1627 ; AVX2-NEXT:    andl $1, %ecx
   1628 ; AVX2-NEXT:    testq %rax, %rax
   1629 ; AVX2-NEXT:    js .LBB45_4
   1630 ; AVX2-NEXT:  # BB#5:
   1631 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   1632 ; AVX2-NEXT:    jmp .LBB45_6
   1633 ; AVX2-NEXT:  .LBB45_4:
   1634 ; AVX2-NEXT:    shrq %rax
   1635 ; AVX2-NEXT:    orq %rax, %rcx
   1636 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   1637 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   1638 ; AVX2-NEXT:  .LBB45_6:
   1639 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   1640 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   1641 ; AVX2-NEXT:    vmovq %xmm0, %rax
   1642 ; AVX2-NEXT:    movl %eax, %ecx
   1643 ; AVX2-NEXT:    andl $1, %ecx
   1644 ; AVX2-NEXT:    testq %rax, %rax
   1645 ; AVX2-NEXT:    js .LBB45_7
   1646 ; AVX2-NEXT:  # BB#8:
   1647 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   1648 ; AVX2-NEXT:    jmp .LBB45_9
   1649 ; AVX2-NEXT:  .LBB45_7:
   1650 ; AVX2-NEXT:    shrq %rax
   1651 ; AVX2-NEXT:    orq %rax, %rcx
   1652 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   1653 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   1654 ; AVX2-NEXT:  .LBB45_9:
   1655 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   1656 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   1657 ; AVX2-NEXT:    movl %eax, %ecx
   1658 ; AVX2-NEXT:    andl $1, %ecx
   1659 ; AVX2-NEXT:    testq %rax, %rax
   1660 ; AVX2-NEXT:    js .LBB45_10
   1661 ; AVX2-NEXT:  # BB#11:
   1662 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1663 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   1664 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1665 ; AVX2-NEXT:    vzeroupper
   1666 ; AVX2-NEXT:    retq
   1667 ; AVX2-NEXT:  .LBB45_10:
   1668 ; AVX2-NEXT:    shrq %rax
   1669 ; AVX2-NEXT:    orq %rax, %rcx
   1670 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1671 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   1672 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
   1673 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1674 ; AVX2-NEXT:    vzeroupper
   1675 ; AVX2-NEXT:    retq
   1676   %cvt = uitofp <4 x i64> %a to <4 x float>
   1677   ret <4 x float> %cvt
   1678 }
   1679 
   1680 define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
   1681 ; SSE-LABEL: uitofp_8i32_to_8f32:
   1682 ; SSE:       # BB#0:
   1683 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
   1684 ; SSE-NEXT:    movdqa %xmm0, %xmm3
   1685 ; SSE-NEXT:    pand %xmm2, %xmm3
   1686 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
   1687 ; SSE-NEXT:    por %xmm4, %xmm3
   1688 ; SSE-NEXT:    psrld $16, %xmm0
   1689 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
   1690 ; SSE-NEXT:    por %xmm5, %xmm0
   1691 ; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
   1692 ; SSE-NEXT:    addps %xmm6, %xmm0
   1693 ; SSE-NEXT:    addps %xmm3, %xmm0
   1694 ; SSE-NEXT:    pand %xmm1, %xmm2
   1695 ; SSE-NEXT:    por %xmm4, %xmm2
   1696 ; SSE-NEXT:    psrld $16, %xmm1
   1697 ; SSE-NEXT:    por %xmm5, %xmm1
   1698 ; SSE-NEXT:    addps %xmm6, %xmm1
   1699 ; SSE-NEXT:    addps %xmm2, %xmm1
   1700 ; SSE-NEXT:    retq
   1701 ;
   1702 ; AVX1-LABEL: uitofp_8i32_to_8f32:
   1703 ; AVX1:       # BB#0:
   1704 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
   1705 ; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
   1706 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
   1707 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1708 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
   1709 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
   1710 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1711 ; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   1712 ; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   1713 ; AVX1-NEXT:    retq
   1714 ;
   1715 ; AVX2-LABEL: uitofp_8i32_to_8f32:
   1716 ; AVX2:       # BB#0:
   1717 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
   1718 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
   1719 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
   1720 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
   1721 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
   1722 ; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
   1723 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   1724 ; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
   1725 ; AVX2-NEXT:    retq
   1726   %cvt = uitofp <8 x i32> %a to <8 x float>
   1727   ret <8 x float> %cvt
   1728 }
   1729 
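; Three expansions of the same unsigned-i32 split are visible above:
; SSE2 and AVX2 use the exponent-insertion form (por/vpblendw with the
; 2^23 and 2^39 bit patterns plus the -(2^39+2^23) bias), while AVX1
; instead converts each 16-bit half with vcvtdq2ps and rescales the high
; half with vmulps; this is the arithmetic form shown in the sketch after
; uitofp_4i32_to_4f32. AVX2 also broadcasts the three constants rather
; than loading full-width vectors.
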
   1730 define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
   1731 ; SSE-LABEL: uitofp_8i16_to_8f32:
   1732 ; SSE:       # BB#0:
   1733 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1734 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1735 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   1736 ; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
   1737 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1738 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
   1739 ; SSE-NEXT:    movaps %xmm2, %xmm0
   1740 ; SSE-NEXT:    retq
   1741 ;
   1742 ; AVX1-LABEL: uitofp_8i16_to_8f32:
   1743 ; AVX1:       # BB#0:
   1744 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1745 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1746 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1747 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1748 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1749 ; AVX1-NEXT:    retq
   1750 ;
   1751 ; AVX2-LABEL: uitofp_8i16_to_8f32:
   1752 ; AVX2:       # BB#0:
   1753 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1754 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1755 ; AVX2-NEXT:    retq
   1756   %cvt = uitofp <8 x i16> %a to <8 x float>
   1757   ret <8 x float> %cvt
   1758 }
   1759 
   1760 define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
   1761 ; SSE-LABEL: uitofp_8i8_to_8f32:
   1762 ; SSE:       # BB#0:
   1763 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1764 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1765 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1766 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   1767 ; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
   1768 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1769 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
   1770 ; SSE-NEXT:    movaps %xmm2, %xmm0
   1771 ; SSE-NEXT:    retq
   1772 ;
   1773 ; AVX1-LABEL: uitofp_8i8_to_8f32:
   1774 ; AVX1:       # BB#0:
   1775 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1776 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   1777 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1778 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   1779 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1780 ; AVX1-NEXT:    retq
   1781 ;
   1782 ; AVX2-LABEL: uitofp_8i8_to_8f32:
   1783 ; AVX2:       # BB#0:
   1784 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
   1785 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1786 ; AVX2-NEXT:    retq
   1787   %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1788   %cvt = uitofp <8 x i8> %shuf to <8 x float>
   1789   ret <8 x float> %cvt
   1790 }
   1791 
   1792 define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
   1793 ; SSE-LABEL: uitofp_16i8_to_8f32:
   1794 ; SSE:       # BB#0:
   1795 ; SSE-NEXT:    pxor %xmm1, %xmm1
   1796 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1797 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   1798 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   1799 ; SSE-NEXT:    cvtdq2ps %xmm2, %xmm2
   1800 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1801 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
   1802 ; SSE-NEXT:    movaps %xmm2, %xmm0
   1803 ; SSE-NEXT:    retq
   1804 ;
   1805 ; AVX1-LABEL: uitofp_16i8_to_8f32:
   1806 ; AVX1:       # BB#0:
   1807 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1808 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   1809 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1810 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   1811 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1812 ; AVX1-NEXT:    retq
   1813 ;
   1814 ; AVX2-LABEL: uitofp_16i8_to_8f32:
   1815 ; AVX2:       # BB#0:
   1816 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
   1817 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1818 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   1819 ; AVX2-NEXT:    retq
   1820   %cvt = uitofp <16 x i8> %a to <16 x float>
   1821   %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1822   ret <8 x float> %shuf
   1823 }
   1824 
   1825 ;
   1826 ; Load Signed Integer to Double
   1827 ;
   1828 
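; Before AVX-512DQ, x86 has no packed conversion between 64-bit integers
; and floating point, so the i64 cases below go element by element:
; extract to a GPR (movd/vmovq/vpextrq) and convert with the scalar
; cvtsi2sd.
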
   1829 define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
   1830 ; SSE-LABEL: sitofp_load_2i64_to_2f64:
   1831 ; SSE:       # BB#0:
   1832 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   1833 ; SSE-NEXT:    movd %xmm1, %rax
   1834 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
   1835 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1836 ; SSE-NEXT:    movd %xmm1, %rax
   1837 ; SSE-NEXT:    xorps %xmm1, %xmm1
   1838 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
   1839 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1840 ; SSE-NEXT:    retq
   1841 ;
   1842 ; AVX-LABEL: sitofp_load_2i64_to_2f64:
   1843 ; AVX:       # BB#0:
   1844 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1845 ; AVX-NEXT:    vpextrq $1, %xmm0, %rax
   1846 ; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
   1847 ; AVX-NEXT:    vmovq %xmm0, %rax
   1848 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1849 ; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
   1850 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1851 ; AVX-NEXT:    retq
   1852   %ld = load <2 x i64>, <2 x i64> *%a
   1853   %cvt = sitofp <2 x i64> %ld to <2 x double>
   1854   ret <2 x double> %cvt
   1855 }
   1856 
   1857 define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
   1858 ; SSE-LABEL: sitofp_load_2i32_to_2f64:
   1859 ; SSE:       # BB#0:
   1860 ; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
   1861 ; SSE-NEXT:    retq
   1862 ;
   1863 ; AVX-LABEL: sitofp_load_2i32_to_2f64:
   1864 ; AVX:       # BB#0:
   1865 ; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
   1866 ; AVX-NEXT:    retq
   1867   %ld = load <2 x i32>, <2 x i32> *%a
   1868   %cvt = sitofp <2 x i32> %ld to <2 x double>
   1869   ret <2 x double> %cvt
   1870 }
   1871 
   1872 define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
   1873 ; SSE-LABEL: sitofp_load_2i16_to_2f64:
   1874 ; SSE:       # BB#0:
   1875 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1876 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1877 ; SSE-NEXT:    psrad $16, %xmm0
   1878 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
   1879 ; SSE-NEXT:    retq
   1880 ;
   1881 ; AVX-LABEL: sitofp_load_2i16_to_2f64:
   1882 ; AVX:       # BB#0:
   1883 ; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
   1884 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1885 ; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
   1886 ; AVX-NEXT:    retq
   1887   %ld = load <2 x i16>, <2 x i16> *%a
   1888   %cvt = sitofp <2 x i16> %ld to <2 x double>
   1889   ret <2 x double> %cvt
   1890 }
   1891 
   1892 define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
   1893 ; SSE-LABEL: sitofp_load_2i8_to_2f64:
   1894 ; SSE:       # BB#0:
   1895 ; SSE-NEXT:    movzwl (%rdi), %eax
   1896 ; SSE-NEXT:    movd %eax, %xmm0
   1897 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1898 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1899 ; SSE-NEXT:    psrad $24, %xmm0
   1900 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
   1901 ; SSE-NEXT:    retq
   1902 ;
   1903 ; AVX-LABEL: sitofp_load_2i8_to_2f64:
   1904 ; AVX:       # BB#0:
   1905 ; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
   1906 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1907 ; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
   1908 ; AVX-NEXT:    retq
   1909   %ld = load <2 x i8>, <2 x i8> *%a
   1910   %cvt = sitofp <2 x i8> %ld to <2 x double>
   1911   ret <2 x double> %cvt
   1912 }
   1913 
   1914 define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
   1915 ; SSE-LABEL: sitofp_load_4i64_to_4f64:
   1916 ; SSE:       # BB#0:
   1917 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   1918 ; SSE-NEXT:    movdqa 16(%rdi), %xmm2
   1919 ; SSE-NEXT:    movd %xmm1, %rax
   1920 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
   1921 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1922 ; SSE-NEXT:    movd %xmm1, %rax
   1923 ; SSE-NEXT:    xorps %xmm1, %xmm1
   1924 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
   1925 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1926 ; SSE-NEXT:    movd %xmm2, %rax
   1927 ; SSE-NEXT:    xorps %xmm1, %xmm1
   1928 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
   1929 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   1930 ; SSE-NEXT:    movd %xmm2, %rax
   1931 ; SSE-NEXT:    xorps %xmm2, %xmm2
   1932 ; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
   1933 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1934 ; SSE-NEXT:    retq
   1935 ;
   1936 ; AVX1-LABEL: sitofp_load_4i64_to_4f64:
   1937 ; AVX1:       # BB#0:
   1938 ; AVX1-NEXT:    vmovaps (%rdi), %ymm0
   1939 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1940 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
   1941 ; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
   1942 ; AVX1-NEXT:    vmovq %xmm1, %rax
   1943 ; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
   1944 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1945 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   1946 ; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
   1947 ; AVX1-NEXT:    vmovq %xmm0, %rax
   1948 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1949 ; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
   1950 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   1951 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1952 ; AVX1-NEXT:    retq
   1953 ;
   1954 ; AVX2-LABEL: sitofp_load_4i64_to_4f64:
   1955 ; AVX2:       # BB#0:
   1956 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1957 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1958 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
   1959 ; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
   1960 ; AVX2-NEXT:    vmovq %xmm1, %rax
   1961 ; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
   1962 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1963 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   1964 ; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
   1965 ; AVX2-NEXT:    vmovq %xmm0, %rax
   1966 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   1967 ; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
   1968 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   1969 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1970 ; AVX2-NEXT:    retq
   1971   %ld = load <4 x i64>, <4 x i64> *%a
   1972   %cvt = sitofp <4 x i64> %ld to <4 x double>
   1973   ret <4 x double> %cvt
   1974 }
   1975 
   1976 define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
   1977 ; SSE-LABEL: sitofp_load_4i32_to_4f64:
   1978 ; SSE:       # BB#0:
   1979 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   1980 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
   1981 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1982 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
   1983 ; SSE-NEXT:    retq
   1984 ;
   1985 ; AVX-LABEL: sitofp_load_4i32_to_4f64:
   1986 ; AVX:       # BB#0:
   1987 ; AVX-NEXT:    vcvtdq2pd (%rdi), %ymm0
   1988 ; AVX-NEXT:    retq
   1989   %ld = load <4 x i32>, <4 x i32> *%a
   1990   %cvt = sitofp <4 x i32> %ld to <4 x double>
   1991   ret <4 x double> %cvt
   1992 }
   1993 
   1994 define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
   1995 ; SSE-LABEL: sitofp_load_4i16_to_4f64:
   1996 ; SSE:       # BB#0:
   1997 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1998 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   1999 ; SSE-NEXT:    psrad $16, %xmm1
   2000 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
   2001 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2002 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
   2003 ; SSE-NEXT:    retq
   2004 ;
   2005 ; AVX-LABEL: sitofp_load_4i16_to_4f64:
   2006 ; AVX:       # BB#0:
   2007 ; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
   2008 ; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2009 ; AVX-NEXT:    retq
   2010   %ld = load <4 x i16>, <4 x i16> *%a
   2011   %cvt = sitofp <4 x i16> %ld to <4 x double>
   2012   ret <4 x double> %cvt
   2013 }
   2014 
   2015 define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
   2016 ; SSE-LABEL: sitofp_load_4i8_to_4f64:
   2017 ; SSE:       # BB#0:
   2018 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2019 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2020 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2021 ; SSE-NEXT:    psrad $24, %xmm1
   2022 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
   2023 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2024 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
   2025 ; SSE-NEXT:    retq
   2026 ;
   2027 ; AVX-LABEL: sitofp_load_4i8_to_4f64:
   2028 ; AVX:       # BB#0:
   2029 ; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
   2030 ; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2031 ; AVX-NEXT:    retq
   2032   %ld = load <4 x i8>, <4 x i8> *%a
   2033   %cvt = sitofp <4 x i8> %ld to <4 x double>
   2034   ret <4 x double> %cvt
   2035 }
   2036 
   2037 ;
   2038 ; Load Unsigned Integer to Double
   2039 ;
   2040 
   2041 define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
   2042 ; SSE-LABEL: uitofp_load_2i64_to_2f64:
   2043 ; SSE:       # BB#0:
   2044 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2045 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
   2046 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
   2047 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2048 ; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
   2049 ; SSE-NEXT:    subpd %xmm4, %xmm1
   2050 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   2051 ; SSE-NEXT:    addpd %xmm1, %xmm0
   2052 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   2053 ; SSE-NEXT:    subpd %xmm4, %xmm3
   2054 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
   2055 ; SSE-NEXT:    addpd %xmm3, %xmm1
   2056 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2057 ; SSE-NEXT:    retq
   2058 ;
   2059 ; AVX-LABEL: uitofp_load_2i64_to_2f64:
   2060 ; AVX:       # BB#0:
   2061 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   2062 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
   2063 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2064 ; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
   2065 ; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
   2066 ; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
   2067 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2068 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2069 ; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
   2070 ; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
   2071 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
   2072 ; AVX-NEXT:    retq
   2073   %ld = load <2 x i64>, <2 x i64> *%a
   2074   %cvt = uitofp <2 x i64> %ld to <2 x double>
   2075   ret <2 x double> %cvt
   2076 }
   2077 
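; The magic constants above implement the classic double-precision
; trick: 0x43300000 (1127219200) and 0x45300000 (1160773632) are the
; high words of 2^52 and 2^84, so interleaving the low and high 32-bit
; halves of each i64 under those words (punpckldq) builds the doubles
; 2^52+lo and 2^84+hi*2^32 exactly; subpd with [4.503600e+15,1.934281e+25]
; (= [2^52,2^84]) strips the biases, and the horizontal add
; (addpd/vhaddpd) recombines hi*2^32+lo with a single rounding. An
; equivalent plain-arithmetic IR sketch (hypothetical function, not part
; of the checked output):

define double @u64_to_f64_split_sketch(i64 %x) {
  %lo    = and i64 %x, 4294967295            ; low 32 bits
  %hi    = lshr i64 %x, 32                   ; high 32 bits
  %lodbl = sitofp i64 %lo to double          ; exact, value < 2^32
  %hidbl = sitofp i64 %hi to double          ; exact, value < 2^32
  %hisc  = fmul double %hidbl, 0x41F0000000000000 ; * 2^32, still exact
  %res   = fadd double %hisc, %lodbl         ; the only rounding step
  ret double %res
}

; uitofp_load_4i64_to_4f64 below repeats this per element, and the SSE
; paths for 2i32 and 4i32 sources zero-extend to i64 and reuse it.
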
   2078 define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
   2079 ; SSE-LABEL: uitofp_load_2i32_to_2f64:
   2080 ; SSE:       # BB#0:
   2081 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   2082 ; SSE-NEXT:    pxor %xmm0, %xmm0
   2083 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   2084 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
   2085 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
   2086 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2087 ; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
   2088 ; SSE-NEXT:    subpd %xmm4, %xmm1
   2089 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   2090 ; SSE-NEXT:    addpd %xmm1, %xmm0
   2091 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   2092 ; SSE-NEXT:    subpd %xmm4, %xmm3
   2093 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
   2094 ; SSE-NEXT:    addpd %xmm3, %xmm1
   2095 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2096 ; SSE-NEXT:    retq
   2097 ;
   2098 ; AVX-LABEL: uitofp_load_2i32_to_2f64:
   2099 ; AVX:       # BB#0:
   2100 ; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
   2101 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
   2102 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2103 ; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
   2104 ; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
   2105 ; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
   2106 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2107 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2108 ; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
   2109 ; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
   2110 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
   2111 ; AVX-NEXT:    retq
   2112   %ld = load <2 x i32>, <2 x i32> *%a
   2113   %cvt = uitofp <2 x i32> %ld to <2 x double>
   2114   ret <2 x double> %cvt
   2115 }
   2116 
   2117 define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
   2118 ; SSE-LABEL: uitofp_load_2i16_to_2f64:
   2119 ; SSE:       # BB#0:
   2120 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2121 ; SSE-NEXT:    pxor %xmm1, %xmm1
   2122 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   2123 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
   2124 ; SSE-NEXT:    retq
   2125 ;
   2126 ; AVX-LABEL: uitofp_load_2i16_to_2f64:
   2127 ; AVX:       # BB#0:
   2128 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2129 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2130 ; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
   2131 ; AVX-NEXT:    retq
   2132   %ld = load <2 x i16>, <2 x i16> *%a
   2133   %cvt = uitofp <2 x i16> %ld to <2 x double>
   2134   ret <2 x double> %cvt
   2135 }
   2136 
   2137 define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
   2138 ; SSE-LABEL: uitofp_load_2i8_to_2f64:
   2139 ; SSE:       # BB#0:
   2140 ; SSE-NEXT:    movzwl (%rdi), %eax
   2141 ; SSE-NEXT:    movd %eax, %xmm0
   2142 ; SSE-NEXT:    pxor %xmm1, %xmm1
   2143 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   2144 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   2145 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
   2146 ; SSE-NEXT:    retq
   2147 ;
   2148 ; AVX-LABEL: uitofp_load_2i8_to_2f64:
   2149 ; AVX:       # BB#0:
   2150 ; AVX-NEXT:    movzwl (%rdi), %eax
   2151 ; AVX-NEXT:    vmovd %eax, %xmm0
   2152 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   2153 ; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
   2154 ; AVX-NEXT:    retq
   2155   %ld = load <2 x i8>, <2 x i8> *%a
   2156   %cvt = uitofp <2 x i8> %ld to <2 x double>
   2157   ret <2 x double> %cvt
   2158 }
   2159 
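; Both i8 elements above are fetched with a single movzwl, then unpacked
; against zero twice to reach i32 lanes before the signed cvtdq2pd,
; which is safe because zero-extended bytes are non-negative.
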
   2160 define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
   2161 ; SSE-LABEL: uitofp_load_4i64_to_4f64:
   2162 ; SSE:       # BB#0:
   2163 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2164 ; SSE-NEXT:    movdqa 16(%rdi), %xmm2
   2165 ; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
   2166 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
   2167 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
   2168 ; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
   2169 ; SSE-NEXT:    subpd %xmm5, %xmm1
   2170 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   2171 ; SSE-NEXT:    addpd %xmm1, %xmm0
   2172 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
   2173 ; SSE-NEXT:    subpd %xmm5, %xmm4
   2174 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
   2175 ; SSE-NEXT:    addpd %xmm4, %xmm1
   2176 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2177 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
   2178 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   2179 ; SSE-NEXT:    subpd %xmm5, %xmm2
   2180 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
   2181 ; SSE-NEXT:    addpd %xmm2, %xmm1
   2182 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
   2183 ; SSE-NEXT:    subpd %xmm5, %xmm4
   2184 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
   2185 ; SSE-NEXT:    addpd %xmm4, %xmm2
   2186 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2187 ; SSE-NEXT:    retq
   2188 ;
   2189 ; AVX1-LABEL: uitofp_load_4i64_to_4f64:
   2190 ; AVX1:       # BB#0:
   2191 ; AVX1-NEXT:    vmovaps (%rdi), %ymm0
   2192 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2193 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
   2194 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2195 ; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
   2196 ; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
   2197 ; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
   2198 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2199 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2200 ; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
   2201 ; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
   2202 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
   2203 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2204 ; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
   2205 ; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
   2206 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2207 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2208 ; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
   2209 ; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
   2210 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
   2211 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2212 ; AVX1-NEXT:    retq
   2213 ;
   2214 ; AVX2-LABEL: uitofp_load_4i64_to_4f64:
   2215 ; AVX2:       # BB#0:
   2216 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   2217 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2218 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
   2219 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2220 ; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
   2221 ; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
   2222 ; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
   2223 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2224 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2225 ; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
   2226 ; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
   2227 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
   2228 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2229 ; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
   2230 ; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
   2231 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2232 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2233 ; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
   2234 ; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
   2235 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
   2236 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2237 ; AVX2-NEXT:    retq
   2238   %ld = load <4 x i64>, <4 x i64> *%a
   2239   %cvt = uitofp <4 x i64> %ld to <4 x double>
   2240   ret <4 x double> %cvt
   2241 }
   2242 
   2243 define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
   2244 ; SSE-LABEL: uitofp_load_4i32_to_4f64:
   2245 ; SSE:       # BB#0:
   2246 ; SSE-NEXT:    movdqa (%rdi), %xmm2
   2247 ; SSE-NEXT:    pxor %xmm1, %xmm1
   2248 ; SSE-NEXT:    movdqa %xmm2, %xmm3
   2249 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
   2250 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
   2251 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
   2252 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
   2253 ; SSE-NEXT:    movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
   2254 ; SSE-NEXT:    subpd %xmm6, %xmm3
   2255 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
   2256 ; SSE-NEXT:    addpd %xmm3, %xmm0
   2257 ; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
   2258 ; SSE-NEXT:    subpd %xmm6, %xmm5
   2259 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
   2260 ; SSE-NEXT:    addpd %xmm5, %xmm3
   2261 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
   2262 ; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
   2263 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
   2264 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
   2265 ; SSE-NEXT:    subpd %xmm6, %xmm2
   2266 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
   2267 ; SSE-NEXT:    addpd %xmm2, %xmm1
   2268 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
   2269 ; SSE-NEXT:    subpd %xmm6, %xmm3
   2270 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
   2271 ; SSE-NEXT:    addpd %xmm3, %xmm2
   2272 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2273 ; SSE-NEXT:    retq
   2274 ;
   2275 ; AVX1-LABEL: uitofp_load_4i32_to_4f64:
   2276 ; AVX1:       # BB#0:
   2277 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
   2278 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
   2279 ; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
   2280 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
   2281 ; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2282 ; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
   2283 ; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   2284 ; AVX1-NEXT:    retq
   2285 ;
   2286 ; AVX2-LABEL: uitofp_load_4i32_to_4f64:
   2287 ; AVX2:       # BB#0:
   2288 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
   2289 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
   2290 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
   2291 ; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
   2292 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
   2293 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
   2294 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2295 ; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2296 ; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
   2297 ; AVX2-NEXT:    retq
   2298   %ld = load <4 x i32>, <4 x i32> *%a
   2299   %cvt = uitofp <4 x i32> %ld to <4 x double>
   2300   ret <4 x double> %cvt
   2301 }
   2302 
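; For unsigned i32 to f64 the AVX forms use the 16-bit split once more
; (mask, shift, two vcvtdq2pd, scale the high half by 2^16, add), but
; since every 32-bit value fits in a double's 53-bit significand the
; final vaddpd is exact rather than merely correctly rounded. The SSE2
; form instead zero-extends each i32 to i64 and reuses the 2^52/2^84
; trick above.
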
   2303 define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
   2304 ; SSE-LABEL: uitofp_load_4i16_to_4f64:
   2305 ; SSE:       # BB#0:
   2306 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   2307 ; SSE-NEXT:    pxor %xmm0, %xmm0
   2308 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2309 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
   2310 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2311 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
   2312 ; SSE-NEXT:    retq
   2313 ;
   2314 ; AVX-LABEL: uitofp_load_4i16_to_4f64:
   2315 ; AVX:       # BB#0:
   2316 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   2317 ; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2318 ; AVX-NEXT:    retq
   2319   %ld = load <4 x i16>, <4 x i16> *%a
   2320   %cvt = uitofp <4 x i16> %ld to <4 x double>
   2321   ret <4 x double> %cvt
   2322 }
   2323 
   2324 define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
   2325 ; SSE-LABEL: uitofp_load_4i8_to_4f64:
   2326 ; SSE:       # BB#0:
   2327 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2328 ; SSE-NEXT:    pxor %xmm0, %xmm0
   2329 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2330 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   2331 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
   2332 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2333 ; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
   2334 ; SSE-NEXT:    retq
   2335 ;
   2336 ; AVX-LABEL: uitofp_load_4i8_to_4f64:
   2337 ; AVX:       # BB#0:
   2338 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2339 ; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
   2340 ; AVX-NEXT:    retq
   2341   %ld = load <4 x i8>, <4 x i8> *%a
   2342   %cvt = uitofp <4 x i8> %ld to <4 x double>
   2343   ret <4 x double> %cvt
   2344 }
   2345 
   2346 ;
   2347 ; Load Signed Integer to Float
   2348 ;
   2349 
   2350 define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
   2351 ; SSE-LABEL: sitofp_load_4i64_to_4f32:
   2352 ; SSE:       # BB#0:
   2353 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2354 ; SSE-NEXT:    movdqa 16(%rdi), %xmm2
   2355 ; SSE-NEXT:    movd %xmm2, %rax
   2356 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
   2357 ; SSE-NEXT:    movd %xmm1, %rax
   2358 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
   2359 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
   2360 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2361 ; SSE-NEXT:    movd %xmm2, %rax
   2362 ; SSE-NEXT:    xorps %xmm2, %xmm2
   2363 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   2364 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2365 ; SSE-NEXT:    movd %xmm1, %rax
   2366 ; SSE-NEXT:    xorps %xmm1, %xmm1
   2367 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   2368 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2369 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2370 ; SSE-NEXT:    retq
   2371 ;
   2372 ; AVX1-LABEL: sitofp_load_4i64_to_4f32:
   2373 ; AVX1:       # BB#0:
   2374 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   2375 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2376 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2377 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2378 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2379 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   2380 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2381 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2382 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2383 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   2384 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2385 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2386 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2387 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2388 ; AVX1-NEXT:    vzeroupper
   2389 ; AVX1-NEXT:    retq
   2390 ;
   2391 ; AVX2-LABEL: sitofp_load_4i64_to_4f32:
   2392 ; AVX2:       # BB#0:
   2393 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   2394 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2395 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2396 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2397 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2398 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   2399 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2400 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2401 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2402 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   2403 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2404 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2405 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2406 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2407 ; AVX2-NEXT:    vzeroupper
   2408 ; AVX2-NEXT:    retq
   2409   %ld = load <4 x i64>, <4 x i64> *%a
   2410   %cvt = sitofp <4 x i64> %ld to <4 x float>
   2411   ret <4 x float> %cvt
   2412 }
   2413 
   2414 define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
   2415 ; SSE-LABEL: sitofp_load_4i32_to_4f32:
   2416 ; SSE:       # BB#0:
   2417 ; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
   2418 ; SSE-NEXT:    retq
   2419 ;
   2420 ; AVX-LABEL: sitofp_load_4i32_to_4f32:
   2421 ; AVX:       # BB#0:
   2422 ; AVX-NEXT:    vcvtdq2ps (%rdi), %xmm0
   2423 ; AVX-NEXT:    retq
   2424   %ld = load <4 x i32>, <4 x i32> *%a
   2425   %cvt = sitofp <4 x i32> %ld to <4 x float>
   2426   ret <4 x float> %cvt
   2427 }
   2428 
   2429 define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
   2430 ; SSE-LABEL: sitofp_load_4i16_to_4f32:
   2431 ; SSE:       # BB#0:
   2432 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   2433 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2434 ; SSE-NEXT:    psrad $16, %xmm0
   2435 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2436 ; SSE-NEXT:    retq
   2437 ;
   2438 ; AVX-LABEL: sitofp_load_4i16_to_4f32:
   2439 ; AVX:       # BB#0:
   2440 ; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
   2441 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   2442 ; AVX-NEXT:    retq
   2443   %ld = load <4 x i16>, <4 x i16> *%a
   2444   %cvt = sitofp <4 x i16> %ld to <4 x float>
   2445   ret <4 x float> %cvt
   2446 }
   2447 
   2448 define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
   2449 ; SSE-LABEL: sitofp_load_4i8_to_4f32:
   2450 ; SSE:       # BB#0:
   2451 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2452 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2453 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2454 ; SSE-NEXT:    psrad $24, %xmm0
   2455 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2456 ; SSE-NEXT:    retq
   2457 ;
   2458 ; AVX-LABEL: sitofp_load_4i8_to_4f32:
   2459 ; AVX:       # BB#0:
   2460 ; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
   2461 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   2462 ; AVX-NEXT:    retq
   2463   %ld = load <4 x i8>, <4 x i8> *%a
   2464   %cvt = sitofp <4 x i8> %ld to <4 x float>
   2465   ret <4 x float> %cvt
   2466 }
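; NOTE: without SSE4.1's pmovsx family, the two tests above sign extend by
; shuffling each narrow element into the top of a 32-bit lane
; (punpcklbw/punpcklwd) and arithmetic-shifting it back down (psrad $16
; for i16 sources, psrad $24 for i8), which replicates the sign bit.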
   2467 
   2468 define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
   2469 ; SSE-LABEL: sitofp_load_8i64_to_8f32:
   2470 ; SSE:       # BB#0:
   2471 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2472 ; SSE-NEXT:    movdqa 16(%rdi), %xmm2
   2473 ; SSE-NEXT:    movdqa 32(%rdi), %xmm3
   2474 ; SSE-NEXT:    movdqa 48(%rdi), %xmm4
   2475 ; SSE-NEXT:    movd %xmm2, %rax
   2476 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm5
   2477 ; SSE-NEXT:    movd %xmm1, %rax
   2478 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
   2479 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
   2480 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2481 ; SSE-NEXT:    movd %xmm2, %rax
   2482 ; SSE-NEXT:    xorps %xmm2, %xmm2
   2483 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   2484 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2485 ; SSE-NEXT:    movd %xmm1, %rax
   2486 ; SSE-NEXT:    xorps %xmm1, %xmm1
   2487 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   2488 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2489 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2490 ; SSE-NEXT:    movd %xmm4, %rax
   2491 ; SSE-NEXT:    xorps %xmm2, %xmm2
   2492 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   2493 ; SSE-NEXT:    movd %xmm3, %rax
   2494 ; SSE-NEXT:    xorps %xmm1, %xmm1
   2495 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   2496 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2497 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
   2498 ; SSE-NEXT:    movd %xmm2, %rax
   2499 ; SSE-NEXT:    xorps %xmm2, %xmm2
   2500 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   2501 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2502 ; SSE-NEXT:    movd %xmm3, %rax
   2503 ; SSE-NEXT:    xorps %xmm3, %xmm3
   2504 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
   2505 ; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
   2506 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
   2507 ; SSE-NEXT:    retq
   2508 ;
   2509 ; AVX1-LABEL: sitofp_load_8i64_to_8f32:
   2510 ; AVX1:       # BB#0:
   2511 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   2512 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
   2513 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
   2514 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2515 ; AVX1-NEXT:    vmovq %xmm1, %rax
   2516 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2517 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2518 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2519 ; AVX1-NEXT:    vmovq %xmm1, %rax
   2520 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2521 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
   2522 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
   2523 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2524 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
   2525 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2526 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2527 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2528 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2529 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2530 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2531 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2532 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2533 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
   2534 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2535 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2536 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2537 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
   2538 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2539 ; AVX1-NEXT:    retq
   2540 ;
   2541 ; AVX2-LABEL: sitofp_load_8i64_to_8f32:
   2542 ; AVX2:       # BB#0:
   2543 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   2544 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
   2545 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
   2546 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2547 ; AVX2-NEXT:    vmovq %xmm1, %rax
   2548 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2549 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2550 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2551 ; AVX2-NEXT:    vmovq %xmm1, %rax
   2552 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2553 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
   2554 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
   2555 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2556 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
   2557 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2558 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2559 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2560 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2561 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2562 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2563 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2564 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   2565 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
   2566 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2567 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2568 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2569 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
   2570 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2571 ; AVX2-NEXT:    retq
   2572   %ld = load <8 x i64>, <8 x i64> *%a
   2573   %cvt = sitofp <8 x i64> %ld to <8 x float>
   2574   ret <8 x float> %cvt
   2575 }
   2576 
   2577 define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
   2578 ; SSE-LABEL: sitofp_load_8i32_to_8f32:
   2579 ; SSE:       # BB#0:
   2580 ; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
   2581 ; SSE-NEXT:    cvtdq2ps 16(%rdi), %xmm1
   2582 ; SSE-NEXT:    retq
   2583 ;
   2584 ; AVX-LABEL: sitofp_load_8i32_to_8f32:
   2585 ; AVX:       # BB#0:
   2586 ; AVX-NEXT:    vcvtdq2ps (%rdi), %ymm0
   2587 ; AVX-NEXT:    retq
   2588   %ld = load <8 x i32>, <8 x i32> *%a
   2589   %cvt = sitofp <8 x i32> %ld to <8 x float>
   2590   ret <8 x float> %cvt
   2591 }
   2592 
   2593 define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
   2594 ; SSE-LABEL: sitofp_load_8i16_to_8f32:
   2595 ; SSE:       # BB#0:
   2596 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   2597 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2598 ; SSE-NEXT:    psrad $16, %xmm0
   2599 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2600 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   2601 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
   2602 ; SSE-NEXT:    psrad $16, %xmm1
   2603 ; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
   2604 ; SSE-NEXT:    retq
   2605 ;
   2606 ; AVX1-LABEL: sitofp_load_8i16_to_8f32:
   2607 ; AVX1:       # BB#0:
   2608 ; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
   2609 ; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
   2610 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2611 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   2612 ; AVX1-NEXT:    retq
   2613 ;
   2614 ; AVX2-LABEL: sitofp_load_8i16_to_8f32:
   2615 ; AVX2:       # BB#0:
   2616 ; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
   2617 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   2618 ; AVX2-NEXT:    retq
   2619   %ld = load <8 x i16>, <8 x i16> *%a
   2620   %cvt = sitofp <8 x i16> %ld to <8 x float>
   2621   ret <8 x float> %cvt
   2622 }
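; NOTE: AVX1 lacks 256-bit integer operations, so the 8-element extension
; above is done as two 128-bit vpmovsxwd halves joined with vinsertf128,
; while AVX2 can sign extend straight into a ymm register before the
; conversion.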
   2623 
   2624 define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
   2625 ; SSE-LABEL: sitofp_load_8i8_to_8f32:
   2626 ; SSE:       # BB#0:
   2627 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2628 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2629 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   2630 ; SSE-NEXT:    psrad $24, %xmm0
   2631 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2632 ; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   2633 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2634 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
   2635 ; SSE-NEXT:    psrad $24, %xmm1
   2636 ; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
   2637 ; SSE-NEXT:    retq
   2638 ;
   2639 ; AVX1-LABEL: sitofp_load_8i8_to_8f32:
   2640 ; AVX1:       # BB#0:
   2641 ; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
   2642 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
   2643 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2644 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
   2645 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   2646 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   2647 ; AVX1-NEXT:    retq
   2648 ;
   2649 ; AVX2-LABEL: sitofp_load_8i8_to_8f32:
   2650 ; AVX2:       # BB#0:
   2651 ; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
   2652 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   2653 ; AVX2-NEXT:    retq
   2654   %ld = load <8 x i8>, <8 x i8> *%a
   2655   %cvt = sitofp <8 x i8> %ld to <8 x float>
   2656   ret <8 x float> %cvt
   2657 }
   2658 
   2659 ;
   2660 ; Load Unsigned Integer to Float
   2661 ;
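; The u64 -> f32 tests below have no single instruction to lower to until
; AVX-512's vcvtusi2ss, so each element goes through the signed converter:
; values with the sign bit clear use cvtsi2ssq directly, while values with
; the sign bit set are halved first (with the shifted-out bit OR'ed back
; in so the final rounding still sees it) and the converted float is then
; doubled. A scalar IR sketch of the per-element strategy the branchy
; blocks below implement (function name hypothetical; kept in a comment so
; the checked functions are unchanged):
;
;   define float @u64_to_f32_sketch(i64 %x) {
;     %neg = icmp slt i64 %x, 0
;     br i1 %neg, label %big, label %small
;   small:                                ; high bit clear: convert as-is
;     %f0 = sitofp i64 %x to float
;     ret float %f0
;   big:                                  ; halve, keep sticky bit, double
;     %half = lshr i64 %x, 1
;     %lsb = and i64 %x, 1
;     %rto = or i64 %half, %lsb
;     %f1 = sitofp i64 %rto to float
;     %f2 = fadd float %f1, %f1
;     ret float %f2
;   }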
   2662 
   2663 define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
   2664 ; SSE-LABEL: uitofp_load_4i64_to_4f32:
   2665 ; SSE:       # BB#0:
   2666 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2667 ; SSE-NEXT:    movdqa 16(%rdi), %xmm3
   2668 ; SSE-NEXT:    movd %xmm3, %rax
   2669 ; SSE-NEXT:    movl %eax, %ecx
   2670 ; SSE-NEXT:    andl $1, %ecx
   2671 ; SSE-NEXT:    testq %rax, %rax
   2672 ; SSE-NEXT:    js .LBB74_1
   2673 ; SSE-NEXT:  # BB#2:
   2674 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   2675 ; SSE-NEXT:    jmp .LBB74_3
   2676 ; SSE-NEXT:  .LBB74_1:
   2677 ; SSE-NEXT:    shrq %rax
   2678 ; SSE-NEXT:    orq %rax, %rcx
   2679 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
   2680 ; SSE-NEXT:    addss %xmm2, %xmm2
   2681 ; SSE-NEXT:  .LBB74_3:
   2682 ; SSE-NEXT:    movd %xmm1, %rax
   2683 ; SSE-NEXT:    movl %eax, %ecx
   2684 ; SSE-NEXT:    andl $1, %ecx
   2685 ; SSE-NEXT:    testq %rax, %rax
   2686 ; SSE-NEXT:    js .LBB74_4
   2687 ; SSE-NEXT:  # BB#5:
   2688 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
   2689 ; SSE-NEXT:    jmp .LBB74_6
   2690 ; SSE-NEXT:  .LBB74_4:
   2691 ; SSE-NEXT:    shrq %rax
   2692 ; SSE-NEXT:    orq %rax, %rcx
   2693 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
   2694 ; SSE-NEXT:    addss %xmm0, %xmm0
   2695 ; SSE-NEXT:  .LBB74_6:
   2696 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2697 ; SSE-NEXT:    movd %xmm3, %rax
   2698 ; SSE-NEXT:    movl %eax, %ecx
   2699 ; SSE-NEXT:    andl $1, %ecx
   2700 ; SSE-NEXT:    testq %rax, %rax
   2701 ; SSE-NEXT:    js .LBB74_7
   2702 ; SSE-NEXT:  # BB#8:
   2703 ; SSE-NEXT:    xorps %xmm3, %xmm3
   2704 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
   2705 ; SSE-NEXT:    jmp .LBB74_9
   2706 ; SSE-NEXT:  .LBB74_7:
   2707 ; SSE-NEXT:    shrq %rax
   2708 ; SSE-NEXT:    orq %rax, %rcx
   2709 ; SSE-NEXT:    xorps %xmm3, %xmm3
   2710 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
   2711 ; SSE-NEXT:    addss %xmm3, %xmm3
   2712 ; SSE-NEXT:  .LBB74_9:
   2713 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2714 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2715 ; SSE-NEXT:    movd %xmm1, %rax
   2716 ; SSE-NEXT:    movl %eax, %ecx
   2717 ; SSE-NEXT:    andl $1, %ecx
   2718 ; SSE-NEXT:    testq %rax, %rax
   2719 ; SSE-NEXT:    js .LBB74_10
   2720 ; SSE-NEXT:  # BB#11:
   2721 ; SSE-NEXT:    xorps %xmm1, %xmm1
   2722 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   2723 ; SSE-NEXT:    jmp .LBB74_12
   2724 ; SSE-NEXT:  .LBB74_10:
   2725 ; SSE-NEXT:    shrq %rax
   2726 ; SSE-NEXT:    orq %rax, %rcx
   2727 ; SSE-NEXT:    xorps %xmm1, %xmm1
   2728 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
   2729 ; SSE-NEXT:    addss %xmm1, %xmm1
   2730 ; SSE-NEXT:  .LBB74_12:
   2731 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
   2732 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2733 ; SSE-NEXT:    retq
   2734 ;
   2735 ; AVX1-LABEL: uitofp_load_4i64_to_4f32:
   2736 ; AVX1:       # BB#0:
   2737 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   2738 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2739 ; AVX1-NEXT:    movl %eax, %ecx
   2740 ; AVX1-NEXT:    andl $1, %ecx
   2741 ; AVX1-NEXT:    testq %rax, %rax
   2742 ; AVX1-NEXT:    js .LBB74_1
   2743 ; AVX1-NEXT:  # BB#2:
   2744 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2745 ; AVX1-NEXT:    jmp .LBB74_3
   2746 ; AVX1-NEXT:  .LBB74_1:
   2747 ; AVX1-NEXT:    shrq %rax
   2748 ; AVX1-NEXT:    orq %rax, %rcx
   2749 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   2750 ; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   2751 ; AVX1-NEXT:  .LBB74_3:
   2752 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2753 ; AVX1-NEXT:    movl %eax, %ecx
   2754 ; AVX1-NEXT:    andl $1, %ecx
   2755 ; AVX1-NEXT:    testq %rax, %rax
   2756 ; AVX1-NEXT:    js .LBB74_4
   2757 ; AVX1-NEXT:  # BB#5:
   2758 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2759 ; AVX1-NEXT:    jmp .LBB74_6
   2760 ; AVX1-NEXT:  .LBB74_4:
   2761 ; AVX1-NEXT:    shrq %rax
   2762 ; AVX1-NEXT:    orq %rax, %rcx
   2763 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   2764 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   2765 ; AVX1-NEXT:  .LBB74_6:
   2766 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   2767 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2768 ; AVX1-NEXT:    vmovq %xmm0, %rax
   2769 ; AVX1-NEXT:    movl %eax, %ecx
   2770 ; AVX1-NEXT:    andl $1, %ecx
   2771 ; AVX1-NEXT:    testq %rax, %rax
   2772 ; AVX1-NEXT:    js .LBB74_7
   2773 ; AVX1-NEXT:  # BB#8:
   2774 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2775 ; AVX1-NEXT:    jmp .LBB74_9
   2776 ; AVX1-NEXT:  .LBB74_7:
   2777 ; AVX1-NEXT:    shrq %rax
   2778 ; AVX1-NEXT:    orq %rax, %rcx
   2779 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   2780 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   2781 ; AVX1-NEXT:  .LBB74_9:
   2782 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   2783 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   2784 ; AVX1-NEXT:    movl %eax, %ecx
   2785 ; AVX1-NEXT:    andl $1, %ecx
   2786 ; AVX1-NEXT:    testq %rax, %rax
   2787 ; AVX1-NEXT:    js .LBB74_10
   2788 ; AVX1-NEXT:  # BB#11:
   2789 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2790 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2791 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2792 ; AVX1-NEXT:    vzeroupper
   2793 ; AVX1-NEXT:    retq
   2794 ; AVX1-NEXT:  .LBB74_10:
   2795 ; AVX1-NEXT:    shrq %rax
   2796 ; AVX1-NEXT:    orq %rax, %rcx
   2797 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2798 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   2799 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
   2800 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2801 ; AVX1-NEXT:    vzeroupper
   2802 ; AVX1-NEXT:    retq
   2803 ;
   2804 ; AVX2-LABEL: uitofp_load_4i64_to_4f32:
   2805 ; AVX2:       # BB#0:
   2806 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   2807 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2808 ; AVX2-NEXT:    movl %eax, %ecx
   2809 ; AVX2-NEXT:    andl $1, %ecx
   2810 ; AVX2-NEXT:    testq %rax, %rax
   2811 ; AVX2-NEXT:    js .LBB74_1
   2812 ; AVX2-NEXT:  # BB#2:
   2813 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   2814 ; AVX2-NEXT:    jmp .LBB74_3
   2815 ; AVX2-NEXT:  .LBB74_1:
   2816 ; AVX2-NEXT:    shrq %rax
   2817 ; AVX2-NEXT:    orq %rax, %rcx
   2818 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   2819 ; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   2820 ; AVX2-NEXT:  .LBB74_3:
   2821 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2822 ; AVX2-NEXT:    movl %eax, %ecx
   2823 ; AVX2-NEXT:    andl $1, %ecx
   2824 ; AVX2-NEXT:    testq %rax, %rax
   2825 ; AVX2-NEXT:    js .LBB74_4
   2826 ; AVX2-NEXT:  # BB#5:
   2827 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2828 ; AVX2-NEXT:    jmp .LBB74_6
   2829 ; AVX2-NEXT:  .LBB74_4:
   2830 ; AVX2-NEXT:    shrq %rax
   2831 ; AVX2-NEXT:    orq %rax, %rcx
   2832 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   2833 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   2834 ; AVX2-NEXT:  .LBB74_6:
   2835 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
   2836 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2837 ; AVX2-NEXT:    vmovq %xmm0, %rax
   2838 ; AVX2-NEXT:    movl %eax, %ecx
   2839 ; AVX2-NEXT:    andl $1, %ecx
   2840 ; AVX2-NEXT:    testq %rax, %rax
   2841 ; AVX2-NEXT:    js .LBB74_7
   2842 ; AVX2-NEXT:  # BB#8:
   2843 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   2844 ; AVX2-NEXT:    jmp .LBB74_9
   2845 ; AVX2-NEXT:  .LBB74_7:
   2846 ; AVX2-NEXT:    shrq %rax
   2847 ; AVX2-NEXT:    orq %rax, %rcx
   2848 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   2849 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   2850 ; AVX2-NEXT:  .LBB74_9:
   2851 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
   2852 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   2853 ; AVX2-NEXT:    movl %eax, %ecx
   2854 ; AVX2-NEXT:    andl $1, %ecx
   2855 ; AVX2-NEXT:    testq %rax, %rax
   2856 ; AVX2-NEXT:    js .LBB74_10
   2857 ; AVX2-NEXT:  # BB#11:
   2858 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2859 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
   2860 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2861 ; AVX2-NEXT:    vzeroupper
   2862 ; AVX2-NEXT:    retq
   2863 ; AVX2-NEXT:  .LBB74_10:
   2864 ; AVX2-NEXT:    shrq %rax
   2865 ; AVX2-NEXT:    orq %rax, %rcx
   2866 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   2867 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   2868 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
   2869 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   2870 ; AVX2-NEXT:    vzeroupper
   2871 ; AVX2-NEXT:    retq
   2872   %ld = load <4 x i64>, <4 x i64> *%a
   2873   %cvt = uitofp <4 x i64> %ld to <4 x float>
   2874   ret <4 x float> %cvt
   2875 }
   2876 
   2877 define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
   2878 ; SSE-LABEL: uitofp_load_4i32_to_4f32:
   2879 ; SSE:       # BB#0:
   2880 ; SSE-NEXT:    movdqa (%rdi), %xmm0
   2881 ; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
   2882 ; SSE-NEXT:    pand %xmm0, %xmm1
   2883 ; SSE-NEXT:    por {{.*}}(%rip), %xmm1
   2884 ; SSE-NEXT:    psrld $16, %xmm0
   2885 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   2886 ; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
   2887 ; SSE-NEXT:    addps %xmm1, %xmm0
   2888 ; SSE-NEXT:    retq
   2889 ;
   2890 ; AVX1-LABEL: uitofp_load_4i32_to_4f32:
   2891 ; AVX1:       # BB#0:
   2892 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
   2893 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
   2894 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
   2895 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
   2896 ; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
   2897 ; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
   2898 ; AVX1-NEXT:    retq
   2899 ;
   2900 ; AVX2-LABEL: uitofp_load_4i32_to_4f32:
   2901 ; AVX2:       # BB#0:
   2902 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
   2903 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
   2904 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   2905 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
   2906 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
   2907 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
   2908 ; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
   2909 ; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
   2910 ; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
   2911 ; AVX2-NEXT:    retq
   2912   %ld = load <4 x i32>, <4 x i32> *%a
   2913   %cvt = uitofp <4 x i32> %ld to <4 x float>
   2914   ret <4 x float> %cvt
   2915 }
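; NOTE: unlike the 64-bit cases, u32 -> f32 above is branchless. Each lane
; is split into 16-bit halves and IEEE-754 exponent patterns are OR'ed in:
;   lo | 0x4B000000 bit-casts to the float 2^23 + lo (exact, since lo < 2^16)
;   hi | 0x53000000 bit-casts to the float 2^39 + hi*2^16
; Adding -(2^39 + 2^23) (the -5.497642e+11 splat visible in the 8i32 SSE
; test below, where the OR constants also appear as the splats 1258291200
; and 1392508928) cancels both biases, leaving exactly hi*2^16 + lo.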
   2916 
   2917 define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
   2918 ; SSE-LABEL: uitofp_load_4i16_to_4f32:
   2919 ; SSE:       # BB#0:
   2920 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   2921 ; SSE-NEXT:    pxor %xmm1, %xmm1
   2922 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   2923 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2924 ; SSE-NEXT:    retq
   2925 ;
   2926 ; AVX-LABEL: uitofp_load_4i16_to_4f32:
   2927 ; AVX:       # BB#0:
   2928 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   2929 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   2930 ; AVX-NEXT:    retq
   2931   %ld = load <4 x i16>, <4 x i16> *%a
   2932   %cvt = uitofp <4 x i16> %ld to <4 x float>
   2933   ret <4 x float> %cvt
   2934 }
   2935 
   2936 define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
   2937 ; SSE-LABEL: uitofp_load_4i8_to_4f32:
   2938 ; SSE:       # BB#0:
   2939 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   2940 ; SSE-NEXT:    pxor %xmm1, %xmm1
   2941 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   2942 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   2943 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   2944 ; SSE-NEXT:    retq
   2945 ;
   2946 ; AVX-LABEL: uitofp_load_4i8_to_4f32:
   2947 ; AVX:       # BB#0:
   2948 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   2949 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
   2950 ; AVX-NEXT:    retq
   2951   %ld = load <4 x i8>, <4 x i8> *%a
   2952   %cvt = uitofp <4 x i8> %ld to <4 x float>
   2953   ret <4 x float> %cvt
   2954 }
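; NOTE: unsigned i16/i8 sources need none of that machinery: zero
; extension (punpck* against a zeroed register, or pmovzx* on AVX) leaves
; every value inside the non-negative i32 range, so the plain signed
; cvtdq2ps conversion is already exact.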
   2955 
   2956 define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
   2957 ; SSE-LABEL: uitofp_load_8i64_to_8f32:
   2958 ; SSE:       # BB#0:
   2959 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   2960 ; SSE-NEXT:    movdqa 16(%rdi), %xmm5
   2961 ; SSE-NEXT:    movdqa 32(%rdi), %xmm2
   2962 ; SSE-NEXT:    movdqa 48(%rdi), %xmm3
   2963 ; SSE-NEXT:    movd %xmm5, %rax
   2964 ; SSE-NEXT:    movl %eax, %ecx
   2965 ; SSE-NEXT:    andl $1, %ecx
   2966 ; SSE-NEXT:    testq %rax, %rax
   2967 ; SSE-NEXT:    js .LBB78_1
   2968 ; SSE-NEXT:  # BB#2:
   2969 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm4
   2970 ; SSE-NEXT:    jmp .LBB78_3
   2971 ; SSE-NEXT:  .LBB78_1:
   2972 ; SSE-NEXT:    shrq %rax
   2973 ; SSE-NEXT:    orq %rax, %rcx
   2974 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm4
   2975 ; SSE-NEXT:    addss %xmm4, %xmm4
   2976 ; SSE-NEXT:  .LBB78_3:
   2977 ; SSE-NEXT:    movd %xmm1, %rax
   2978 ; SSE-NEXT:    movl %eax, %ecx
   2979 ; SSE-NEXT:    andl $1, %ecx
   2980 ; SSE-NEXT:    testq %rax, %rax
   2981 ; SSE-NEXT:    js .LBB78_4
   2982 ; SSE-NEXT:  # BB#5:
   2983 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
   2984 ; SSE-NEXT:    jmp .LBB78_6
   2985 ; SSE-NEXT:  .LBB78_4:
   2986 ; SSE-NEXT:    shrq %rax
   2987 ; SSE-NEXT:    orq %rax, %rcx
   2988 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
   2989 ; SSE-NEXT:    addss %xmm0, %xmm0
   2990 ; SSE-NEXT:  .LBB78_6:
   2991 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
   2992 ; SSE-NEXT:    movd %xmm5, %rax
   2993 ; SSE-NEXT:    movl %eax, %ecx
   2994 ; SSE-NEXT:    andl $1, %ecx
   2995 ; SSE-NEXT:    testq %rax, %rax
   2996 ; SSE-NEXT:    js .LBB78_7
   2997 ; SSE-NEXT:  # BB#8:
   2998 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm6
   2999 ; SSE-NEXT:    jmp .LBB78_9
   3000 ; SSE-NEXT:  .LBB78_7:
   3001 ; SSE-NEXT:    shrq %rax
   3002 ; SSE-NEXT:    orq %rax, %rcx
   3003 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm6
   3004 ; SSE-NEXT:    addss %xmm6, %xmm6
   3005 ; SSE-NEXT:  .LBB78_9:
   3006 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   3007 ; SSE-NEXT:    movd %xmm1, %rax
   3008 ; SSE-NEXT:    movl %eax, %ecx
   3009 ; SSE-NEXT:    andl $1, %ecx
   3010 ; SSE-NEXT:    testq %rax, %rax
   3011 ; SSE-NEXT:    js .LBB78_10
   3012 ; SSE-NEXT:  # BB#11:
   3013 ; SSE-NEXT:    xorps %xmm5, %xmm5
   3014 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm5
   3015 ; SSE-NEXT:    jmp .LBB78_12
   3016 ; SSE-NEXT:  .LBB78_10:
   3017 ; SSE-NEXT:    shrq %rax
   3018 ; SSE-NEXT:    orq %rax, %rcx
   3019 ; SSE-NEXT:    xorps %xmm5, %xmm5
   3020 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm5
   3021 ; SSE-NEXT:    addss %xmm5, %xmm5
   3022 ; SSE-NEXT:  .LBB78_12:
   3023 ; SSE-NEXT:    movd %xmm3, %rax
   3024 ; SSE-NEXT:    movl %eax, %ecx
   3025 ; SSE-NEXT:    andl $1, %ecx
   3026 ; SSE-NEXT:    testq %rax, %rax
   3027 ; SSE-NEXT:    js .LBB78_13
   3028 ; SSE-NEXT:  # BB#14:
   3029 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm7
   3030 ; SSE-NEXT:    jmp .LBB78_15
   3031 ; SSE-NEXT:  .LBB78_13:
   3032 ; SSE-NEXT:    shrq %rax
   3033 ; SSE-NEXT:    orq %rax, %rcx
   3034 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm7
   3035 ; SSE-NEXT:    addss %xmm7, %xmm7
   3036 ; SSE-NEXT:  .LBB78_15:
   3037 ; SSE-NEXT:    movd %xmm2, %rax
   3038 ; SSE-NEXT:    movl %eax, %ecx
   3039 ; SSE-NEXT:    andl $1, %ecx
   3040 ; SSE-NEXT:    testq %rax, %rax
   3041 ; SSE-NEXT:    js .LBB78_16
   3042 ; SSE-NEXT:  # BB#17:
   3043 ; SSE-NEXT:    xorps %xmm1, %xmm1
   3044 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
   3045 ; SSE-NEXT:    jmp .LBB78_18
   3046 ; SSE-NEXT:  .LBB78_16:
   3047 ; SSE-NEXT:    shrq %rax
   3048 ; SSE-NEXT:    orq %rax, %rcx
   3049 ; SSE-NEXT:    xorps %xmm1, %xmm1
   3050 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
   3051 ; SSE-NEXT:    addss %xmm1, %xmm1
   3052 ; SSE-NEXT:  .LBB78_18:
   3053 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
   3054 ; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
   3055 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   3056 ; SSE-NEXT:    movd %xmm3, %rax
   3057 ; SSE-NEXT:    movl %eax, %ecx
   3058 ; SSE-NEXT:    andl $1, %ecx
   3059 ; SSE-NEXT:    testq %rax, %rax
   3060 ; SSE-NEXT:    js .LBB78_19
   3061 ; SSE-NEXT:  # BB#20:
   3062 ; SSE-NEXT:    xorps %xmm3, %xmm3
   3063 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
   3064 ; SSE-NEXT:    jmp .LBB78_21
   3065 ; SSE-NEXT:  .LBB78_19:
   3066 ; SSE-NEXT:    shrq %rax
   3067 ; SSE-NEXT:    orq %rax, %rcx
   3068 ; SSE-NEXT:    xorps %xmm3, %xmm3
   3069 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
   3070 ; SSE-NEXT:    addss %xmm3, %xmm3
   3071 ; SSE-NEXT:  .LBB78_21:
   3072 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
   3073 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
   3074 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   3075 ; SSE-NEXT:    movd %xmm2, %rax
   3076 ; SSE-NEXT:    movl %eax, %ecx
   3077 ; SSE-NEXT:    andl $1, %ecx
   3078 ; SSE-NEXT:    testq %rax, %rax
   3079 ; SSE-NEXT:    js .LBB78_22
   3080 ; SSE-NEXT:  # BB#23:
   3081 ; SSE-NEXT:    xorps %xmm2, %xmm2
   3082 ; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
   3083 ; SSE-NEXT:    jmp .LBB78_24
   3084 ; SSE-NEXT:  .LBB78_22:
   3085 ; SSE-NEXT:    shrq %rax
   3086 ; SSE-NEXT:    orq %rax, %rcx
   3087 ; SSE-NEXT:    xorps %xmm2, %xmm2
   3088 ; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
   3089 ; SSE-NEXT:    addss %xmm2, %xmm2
   3090 ; SSE-NEXT:  .LBB78_24:
   3091 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   3092 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   3093 ; SSE-NEXT:    retq
   3094 ;
   3095 ; AVX1-LABEL: uitofp_load_8i64_to_8f32:
   3096 ; AVX1:       # BB#0:
   3097 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   3098 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm2
   3099 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
   3100 ; AVX1-NEXT:    movl %eax, %ecx
   3101 ; AVX1-NEXT:    andl $1, %ecx
   3102 ; AVX1-NEXT:    testq %rax, %rax
   3103 ; AVX1-NEXT:    js .LBB78_1
   3104 ; AVX1-NEXT:  # BB#2:
   3105 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   3106 ; AVX1-NEXT:    jmp .LBB78_3
   3107 ; AVX1-NEXT:  .LBB78_1:
   3108 ; AVX1-NEXT:    shrq %rax
   3109 ; AVX1-NEXT:    orq %rax, %rcx
   3110 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   3111 ; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   3112 ; AVX1-NEXT:  .LBB78_3:
   3113 ; AVX1-NEXT:    vmovq %xmm2, %rax
   3114 ; AVX1-NEXT:    movl %eax, %ecx
   3115 ; AVX1-NEXT:    andl $1, %ecx
   3116 ; AVX1-NEXT:    testq %rax, %rax
   3117 ; AVX1-NEXT:    js .LBB78_4
   3118 ; AVX1-NEXT:  # BB#5:
   3119 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   3120 ; AVX1-NEXT:    jmp .LBB78_6
   3121 ; AVX1-NEXT:  .LBB78_4:
   3122 ; AVX1-NEXT:    shrq %rax
   3123 ; AVX1-NEXT:    orq %rax, %rcx
   3124 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
   3125 ; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm3
   3126 ; AVX1-NEXT:  .LBB78_6:
   3127 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   3128 ; AVX1-NEXT:    vmovq %xmm2, %rax
   3129 ; AVX1-NEXT:    movl %eax, %ecx
   3130 ; AVX1-NEXT:    andl $1, %ecx
   3131 ; AVX1-NEXT:    testq %rax, %rax
   3132 ; AVX1-NEXT:    js .LBB78_7
   3133 ; AVX1-NEXT:  # BB#8:
   3134 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm4
   3135 ; AVX1-NEXT:    jmp .LBB78_9
   3136 ; AVX1-NEXT:  .LBB78_7:
   3137 ; AVX1-NEXT:    shrq %rax
   3138 ; AVX1-NEXT:    orq %rax, %rcx
   3139 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm4
   3140 ; AVX1-NEXT:    vaddss %xmm4, %xmm4, %xmm4
   3141 ; AVX1-NEXT:  .LBB78_9:
   3142 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
   3143 ; AVX1-NEXT:    movl %eax, %ecx
   3144 ; AVX1-NEXT:    andl $1, %ecx
   3145 ; AVX1-NEXT:    testq %rax, %rax
   3146 ; AVX1-NEXT:    js .LBB78_10
   3147 ; AVX1-NEXT:  # BB#11:
   3148 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   3149 ; AVX1-NEXT:    jmp .LBB78_12
   3150 ; AVX1-NEXT:  .LBB78_10:
   3151 ; AVX1-NEXT:    shrq %rax
   3152 ; AVX1-NEXT:    orq %rax, %rcx
   3153 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   3154 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   3155 ; AVX1-NEXT:  .LBB78_12:
   3156 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
   3157 ; AVX1-NEXT:    movl %eax, %ecx
   3158 ; AVX1-NEXT:    andl $1, %ecx
   3159 ; AVX1-NEXT:    testq %rax, %rax
   3160 ; AVX1-NEXT:    js .LBB78_13
   3161 ; AVX1-NEXT:  # BB#14:
   3162 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
   3163 ; AVX1-NEXT:    jmp .LBB78_15
   3164 ; AVX1-NEXT:  .LBB78_13:
   3165 ; AVX1-NEXT:    shrq %rax
   3166 ; AVX1-NEXT:    orq %rax, %rcx
   3167 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm5
   3168 ; AVX1-NEXT:    vaddss %xmm5, %xmm5, %xmm5
   3169 ; AVX1-NEXT:  .LBB78_15:
   3170 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
   3171 ; AVX1-NEXT:    vmovq %xmm0, %rax
   3172 ; AVX1-NEXT:    movl %eax, %ecx
   3173 ; AVX1-NEXT:    andl $1, %ecx
   3174 ; AVX1-NEXT:    testq %rax, %rax
   3175 ; AVX1-NEXT:    js .LBB78_16
   3176 ; AVX1-NEXT:  # BB#17:
   3177 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   3178 ; AVX1-NEXT:    jmp .LBB78_18
   3179 ; AVX1-NEXT:  .LBB78_16:
   3180 ; AVX1-NEXT:    shrq %rax
   3181 ; AVX1-NEXT:    orq %rax, %rcx
   3182 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
   3183 ; AVX1-NEXT:    vaddss %xmm3, %xmm3, %xmm3
   3184 ; AVX1-NEXT:  .LBB78_18:
   3185 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
   3186 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
   3187 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
   3188 ; AVX1-NEXT:    vmovq %xmm4, %rax
   3189 ; AVX1-NEXT:    movl %eax, %ecx
   3190 ; AVX1-NEXT:    andl $1, %ecx
   3191 ; AVX1-NEXT:    testq %rax, %rax
   3192 ; AVX1-NEXT:    js .LBB78_19
   3193 ; AVX1-NEXT:  # BB#20:
   3194 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   3195 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
   3196 ; AVX1-NEXT:    jmp .LBB78_21
   3197 ; AVX1-NEXT:  .LBB78_19:
   3198 ; AVX1-NEXT:    shrq %rax
   3199 ; AVX1-NEXT:    orq %rax, %rcx
   3200 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   3201 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   3202 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm5
   3203 ; AVX1-NEXT:  .LBB78_21:
   3204 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
   3205 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
   3206 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
   3207 ; AVX1-NEXT:    movl %eax, %ecx
   3208 ; AVX1-NEXT:    andl $1, %ecx
   3209 ; AVX1-NEXT:    testq %rax, %rax
   3210 ; AVX1-NEXT:    js .LBB78_22
   3211 ; AVX1-NEXT:  # BB#23:
   3212 ; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   3213 ; AVX1-NEXT:    jmp .LBB78_24
   3214 ; AVX1-NEXT:  .LBB78_22:
   3215 ; AVX1-NEXT:    shrq %rax
   3216 ; AVX1-NEXT:    orq %rax, %rcx
   3217 ; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   3218 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   3219 ; AVX1-NEXT:  .LBB78_24:
   3220 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
   3221 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   3222 ; AVX1-NEXT:    retq
   3223 ;
   3224 ; AVX2-LABEL: uitofp_load_8i64_to_8f32:
   3225 ; AVX2:       # BB#0:
   3226 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   3227 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
   3228 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
   3229 ; AVX2-NEXT:    movl %eax, %ecx
   3230 ; AVX2-NEXT:    andl $1, %ecx
   3231 ; AVX2-NEXT:    testq %rax, %rax
   3232 ; AVX2-NEXT:    js .LBB78_1
   3233 ; AVX2-NEXT:  # BB#2:
   3234 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
   3235 ; AVX2-NEXT:    jmp .LBB78_3
   3236 ; AVX2-NEXT:  .LBB78_1:
   3237 ; AVX2-NEXT:    shrq %rax
   3238 ; AVX2-NEXT:    orq %rax, %rcx
   3239 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
   3240 ; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
   3241 ; AVX2-NEXT:  .LBB78_3:
   3242 ; AVX2-NEXT:    vmovq %xmm2, %rax
   3243 ; AVX2-NEXT:    movl %eax, %ecx
   3244 ; AVX2-NEXT:    andl $1, %ecx
   3245 ; AVX2-NEXT:    testq %rax, %rax
   3246 ; AVX2-NEXT:    js .LBB78_4
   3247 ; AVX2-NEXT:  # BB#5:
   3248 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   3249 ; AVX2-NEXT:    jmp .LBB78_6
   3250 ; AVX2-NEXT:  .LBB78_4:
   3251 ; AVX2-NEXT:    shrq %rax
   3252 ; AVX2-NEXT:    orq %rax, %rcx
   3253 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
   3254 ; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
   3255 ; AVX2-NEXT:  .LBB78_6:
   3256 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
   3257 ; AVX2-NEXT:    vmovq %xmm2, %rax
   3258 ; AVX2-NEXT:    movl %eax, %ecx
   3259 ; AVX2-NEXT:    andl $1, %ecx
   3260 ; AVX2-NEXT:    testq %rax, %rax
   3261 ; AVX2-NEXT:    js .LBB78_7
   3262 ; AVX2-NEXT:  # BB#8:
   3263 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm4
   3264 ; AVX2-NEXT:    jmp .LBB78_9
   3265 ; AVX2-NEXT:  .LBB78_7:
   3266 ; AVX2-NEXT:    shrq %rax
   3267 ; AVX2-NEXT:    orq %rax, %rcx
   3268 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm4
   3269 ; AVX2-NEXT:    vaddss %xmm4, %xmm4, %xmm4
   3270 ; AVX2-NEXT:  .LBB78_9:
   3271 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
   3272 ; AVX2-NEXT:    movl %eax, %ecx
   3273 ; AVX2-NEXT:    andl $1, %ecx
   3274 ; AVX2-NEXT:    testq %rax, %rax
   3275 ; AVX2-NEXT:    js .LBB78_10
   3276 ; AVX2-NEXT:  # BB#11:
   3277 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   3278 ; AVX2-NEXT:    jmp .LBB78_12
   3279 ; AVX2-NEXT:  .LBB78_10:
   3280 ; AVX2-NEXT:    shrq %rax
   3281 ; AVX2-NEXT:    orq %rax, %rcx
   3282 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   3283 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   3284 ; AVX2-NEXT:  .LBB78_12:
   3285 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
   3286 ; AVX2-NEXT:    movl %eax, %ecx
   3287 ; AVX2-NEXT:    andl $1, %ecx
   3288 ; AVX2-NEXT:    testq %rax, %rax
   3289 ; AVX2-NEXT:    js .LBB78_13
   3290 ; AVX2-NEXT:  # BB#14:
   3291 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
   3292 ; AVX2-NEXT:    jmp .LBB78_15
   3293 ; AVX2-NEXT:  .LBB78_13:
   3294 ; AVX2-NEXT:    shrq %rax
   3295 ; AVX2-NEXT:    orq %rax, %rcx
   3296 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm5
   3297 ; AVX2-NEXT:    vaddss %xmm5, %xmm5, %xmm5
   3298 ; AVX2-NEXT:  .LBB78_15:
   3299 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
   3300 ; AVX2-NEXT:    vmovq %xmm0, %rax
   3301 ; AVX2-NEXT:    movl %eax, %ecx
   3302 ; AVX2-NEXT:    andl $1, %ecx
   3303 ; AVX2-NEXT:    testq %rax, %rax
   3304 ; AVX2-NEXT:    js .LBB78_16
   3305 ; AVX2-NEXT:  # BB#17:
   3306 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm3
   3307 ; AVX2-NEXT:    jmp .LBB78_18
   3308 ; AVX2-NEXT:  .LBB78_16:
   3309 ; AVX2-NEXT:    shrq %rax
   3310 ; AVX2-NEXT:    orq %rax, %rcx
   3311 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm3
   3312 ; AVX2-NEXT:    vaddss %xmm3, %xmm3, %xmm3
   3313 ; AVX2-NEXT:  .LBB78_18:
   3314 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
   3315 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
   3316 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
   3317 ; AVX2-NEXT:    vmovq %xmm4, %rax
   3318 ; AVX2-NEXT:    movl %eax, %ecx
   3319 ; AVX2-NEXT:    andl $1, %ecx
   3320 ; AVX2-NEXT:    testq %rax, %rax
   3321 ; AVX2-NEXT:    js .LBB78_19
   3322 ; AVX2-NEXT:  # BB#20:
   3323 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   3324 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm5
   3325 ; AVX2-NEXT:    jmp .LBB78_21
   3326 ; AVX2-NEXT:  .LBB78_19:
   3327 ; AVX2-NEXT:    shrq %rax
   3328 ; AVX2-NEXT:    orq %rax, %rcx
   3329 ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
   3330 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
   3331 ; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm5
   3332 ; AVX2-NEXT:  .LBB78_21:
   3333 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
   3334 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
   3335 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
   3336 ; AVX2-NEXT:    movl %eax, %ecx
   3337 ; AVX2-NEXT:    andl $1, %ecx
   3338 ; AVX2-NEXT:    testq %rax, %rax
   3339 ; AVX2-NEXT:    js .LBB78_22
   3340 ; AVX2-NEXT:  # BB#23:
   3341 ; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
   3342 ; AVX2-NEXT:    jmp .LBB78_24
   3343 ; AVX2-NEXT:  .LBB78_22:
   3344 ; AVX2-NEXT:    shrq %rax
   3345 ; AVX2-NEXT:    orq %rax, %rcx
   3346 ; AVX2-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm2
   3347 ; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
   3348 ; AVX2-NEXT:  .LBB78_24:
   3349 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
   3350 ; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   3351 ; AVX2-NEXT:    retq
   3352   %ld = load <8 x i64>, <8 x i64> *%a
   3353   %cvt = uitofp <8 x i64> %ld to <8 x float>
   3354   ret <8 x float> %cvt
   3355 }
   3356 
   3357 define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
   3358 ; SSE-LABEL: uitofp_load_8i32_to_8f32:
   3359 ; SSE:       # BB#0:
   3360 ; SSE-NEXT:    movdqa (%rdi), %xmm0
   3361 ; SSE-NEXT:    movdqa 16(%rdi), %xmm1
   3362 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
   3363 ; SSE-NEXT:    movdqa %xmm0, %xmm3
   3364 ; SSE-NEXT:    pand %xmm2, %xmm3
   3365 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
   3366 ; SSE-NEXT:    por %xmm4, %xmm3
   3367 ; SSE-NEXT:    psrld $16, %xmm0
   3368 ; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
   3369 ; SSE-NEXT:    por %xmm5, %xmm0
   3370 ; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
   3371 ; SSE-NEXT:    addps %xmm6, %xmm0
   3372 ; SSE-NEXT:    addps %xmm3, %xmm0
   3373 ; SSE-NEXT:    pand %xmm1, %xmm2
   3374 ; SSE-NEXT:    por %xmm4, %xmm2
   3375 ; SSE-NEXT:    psrld $16, %xmm1
   3376 ; SSE-NEXT:    por %xmm5, %xmm1
   3377 ; SSE-NEXT:    addps %xmm6, %xmm1
   3378 ; SSE-NEXT:    addps %xmm2, %xmm1
   3379 ; SSE-NEXT:    retq
   3380 ;
   3381 ; AVX1-LABEL: uitofp_load_8i32_to_8f32:
   3382 ; AVX1:       # BB#0:
   3383 ; AVX1-NEXT:    vmovaps (%rdi), %ymm0
   3384 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm1
   3385 ; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
   3386 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
   3387 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3388 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
   3389 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
   3390 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3391 ; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   3392 ; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
   3393 ; AVX1-NEXT:    retq
   3394 ;
   3395 ; AVX2-LABEL: uitofp_load_8i32_to_8f32:
   3396 ; AVX2:       # BB#0:
   3397 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   3398 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
   3399 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
   3400 ; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
   3401 ; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
   3402 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
   3403 ; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
   3404 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   3405 ; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
   3406 ; AVX2-NEXT:    retq
   3407   %ld = load <8 x i32>, <8 x i32> *%a
   3408   %cvt = uitofp <8 x i32> %ld to <8 x float>
   3409   ret <8 x float> %cvt
   3410 }
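; NOTE: the AVX1 form above stays branch-free in ymm float ops instead:
; the masked low halves are non-negative i32 values, so vcvtdq2ps converts
; them exactly, and the shifted high halves are converted, rescaled with
; vmulps (by what is presumably a 65536.0 splat behind the {{.*}}(%rip)
; operand), and added back in.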
   3411 
   3412 define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
   3413 ; SSE-LABEL: uitofp_load_8i16_to_8f32:
   3414 ; SSE:       # BB#0:
   3415 ; SSE-NEXT:    movdqa (%rdi), %xmm1
   3416 ; SSE-NEXT:    pxor %xmm2, %xmm2
   3417 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   3418 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   3419 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   3420 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   3421 ; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
   3422 ; SSE-NEXT:    retq
   3423 ;
   3424 ; AVX1-LABEL: uitofp_load_8i16_to_8f32:
   3425 ; AVX1:       # BB#0:
   3426 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   3427 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
   3428 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   3429 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3430 ; AVX1-NEXT:    retq
   3431 ;
   3432 ; AVX2-LABEL: uitofp_load_8i16_to_8f32:
   3433 ; AVX2:       # BB#0:
   3434 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   3435 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3436 ; AVX2-NEXT:    retq
   3437   %ld = load <8 x i16>, <8 x i16> *%a
   3438   %cvt = uitofp <8 x i16> %ld to <8 x float>
   3439   ret <8 x float> %cvt
   3440 }
   3441 
   3442 define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
   3443 ; SSE-LABEL: uitofp_load_8i8_to_8f32:
   3444 ; SSE:       # BB#0:
   3445 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   3446 ; SSE-NEXT:    pxor %xmm2, %xmm2
   3447 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   3448 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   3449 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   3450 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   3451 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   3452 ; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
   3453 ; SSE-NEXT:    retq
   3454 ;
   3455 ; AVX1-LABEL: uitofp_load_8i8_to_8f32:
   3456 ; AVX1:       # BB#0:
   3457 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   3458 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
   3459 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   3460 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3461 ; AVX1-NEXT:    retq
   3462 ;
   3463 ; AVX2-LABEL: uitofp_load_8i8_to_8f32:
   3464 ; AVX2:       # BB#0:
   3465 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
   3466 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3467 ; AVX2-NEXT:    retq
   3468   %ld = load <8 x i8>, <8 x i8> *%a
   3469   %cvt = uitofp <8 x i8> %ld to <8 x float>
   3470   ret <8 x float> %cvt
   3471 }
   3472 
   3473 ;
   3474 ; Aggregates
   3475 ;
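; The %Arguments type below is packed (<{ ... }>), so it has no padding:
; the <8 x i16> field sits at byte offset 8 and the result pointer at
; offset 24. That is why the checks match movq 24(%rdi) plus an unaligned
; movdqu 8(%rdi) (the aggregate load itself is only align 1).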
   3476 
   3477 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
   3478 define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
   3479 ; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
   3480 ; SSE:       # BB#0:
   3481 ; SSE-NEXT:    movq 24(%rdi), %rax
   3482 ; SSE-NEXT:    movdqu 8(%rdi), %xmm0
   3483 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
   3484 ; SSE-NEXT:    psrad $16, %xmm1
   3485 ; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
   3486 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
   3487 ; SSE-NEXT:    psrad $16, %xmm0
   3488 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
   3489 ; SSE-NEXT:    movaps %xmm0, 16(%rax)
   3490 ; SSE-NEXT:    movaps %xmm1, (%rax)
   3491 ; SSE-NEXT:    retq
   3492 ;
   3493 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
   3494 ; AVX1:       # BB#0:
   3495 ; AVX1-NEXT:    movq 24(%rdi), %rax
   3496 ; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
   3497 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
   3498 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   3499 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
   3500 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   3501 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3502 ; AVX1-NEXT:    vmovaps %ymm0, (%rax)
   3503 ; AVX1-NEXT:    vzeroupper
   3504 ; AVX1-NEXT:    retq
   3505 ;
   3506 ; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
   3507 ; AVX2:       # BB#0:
   3508 ; AVX2-NEXT:    movq 24(%rdi), %rax
   3509 ; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
   3510 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
   3511 ; AVX2-NEXT:    vmovaps %ymm0, (%rax)
   3512 ; AVX2-NEXT:    vzeroupper
   3513 ; AVX2-NEXT:    retq
    3514   %1 = load %Arguments, %Arguments* %a0, align 1
    3515   %2 = extractvalue %Arguments %1, 1
    3516   %3 = extractvalue %Arguments %1, 2
    3517   %4 = sitofp <8 x i16> %2 to <8 x float>
    3518   store <8 x float> %4, <8 x float>* %3, align 32
    3519   ret void
   3520 }
   3521