; (code-browser navigation residue, kept as a comment so the file stays parseable)
; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
      7 
      8 ;
      9 ; Half to Float
     10 ;
     11 
; Scalar half -> float: bitcast the i16 bit pattern to half, then fpext.
; All five run lines share one expected sequence (prefix ALL): sign-extend
; the argument GPR with movswl, move it to an XMM register, convert with
; vcvtph2ps (F16C / AVX-512).
     12 define float @cvt_i16_to_f32(i16 %a0) nounwind {
     13 ; ALL-LABEL: cvt_i16_to_f32:
     14 ; ALL:       # %bb.0:
     15 ; ALL-NEXT:    movswl %di, %eax
     16 ; ALL-NEXT:    vmovd %eax, %xmm0
     17 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
     18 ; ALL-NEXT:    retq
     19   %1 = bitcast i16 %a0 to half
     20   %2 = fpext half %1 to float
     21   ret float %2
     22 }
     23 
; <4 x i16> bitcast to <4 x half>, extended to <4 x float>.
; AVX1/AVX2/AVX512F first pack the four halves into the low quadword with
; vpshufb, then scalarize: shift/sign-extend each 16-bit field, convert each
; with vcvtph2ps, and reassemble the vector with vinsertps.
; AVX512VL instead truncates through a stack slot with vpmovdw before the
; same scalar conversion sequence.
     24 define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
     25 ; AVX1-LABEL: cvt_4i16_to_4f32:
     26 ; AVX1:       # %bb.0:
     27 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     28 ; AVX1-NEXT:    vmovq %xmm0, %rax
     29 ; AVX1-NEXT:    movq %rax, %rcx
     30 ; AVX1-NEXT:    movq %rax, %rdx
     31 ; AVX1-NEXT:    movswl %ax, %esi
     32 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
     33 ; AVX1-NEXT:    shrl $16, %eax
     34 ; AVX1-NEXT:    shrq $32, %rcx
     35 ; AVX1-NEXT:    shrq $48, %rdx
     36 ; AVX1-NEXT:    movswl %dx, %edx
     37 ; AVX1-NEXT:    vmovd %edx, %xmm0
     38 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
     39 ; AVX1-NEXT:    movswl %cx, %ecx
     40 ; AVX1-NEXT:    vmovd %ecx, %xmm1
     41 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
     42 ; AVX1-NEXT:    cwtl
     43 ; AVX1-NEXT:    vmovd %eax, %xmm2
     44 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
     45 ; AVX1-NEXT:    vmovd %esi, %xmm3
     46 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
     47 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
     48 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
     49 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
     50 ; AVX1-NEXT:    retq
     51 ;
     52 ; AVX2-LABEL: cvt_4i16_to_4f32:
     53 ; AVX2:       # %bb.0:
     54 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     55 ; AVX2-NEXT:    vmovq %xmm0, %rax
     56 ; AVX2-NEXT:    movq %rax, %rcx
     57 ; AVX2-NEXT:    movq %rax, %rdx
     58 ; AVX2-NEXT:    movswl %ax, %esi
     59 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
     60 ; AVX2-NEXT:    shrl $16, %eax
     61 ; AVX2-NEXT:    shrq $32, %rcx
     62 ; AVX2-NEXT:    shrq $48, %rdx
     63 ; AVX2-NEXT:    movswl %dx, %edx
     64 ; AVX2-NEXT:    vmovd %edx, %xmm0
     65 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
     66 ; AVX2-NEXT:    movswl %cx, %ecx
     67 ; AVX2-NEXT:    vmovd %ecx, %xmm1
     68 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
     69 ; AVX2-NEXT:    cwtl
     70 ; AVX2-NEXT:    vmovd %eax, %xmm2
     71 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
     72 ; AVX2-NEXT:    vmovd %esi, %xmm3
     73 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
     74 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
     75 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
     76 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
     77 ; AVX2-NEXT:    retq
     78 ;
     79 ; AVX512F-LABEL: cvt_4i16_to_4f32:
     80 ; AVX512F:       # %bb.0:
     81 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     82 ; AVX512F-NEXT:    vmovq %xmm0, %rax
     83 ; AVX512F-NEXT:    movq %rax, %rcx
     84 ; AVX512F-NEXT:    movq %rax, %rdx
     85 ; AVX512F-NEXT:    movswl %ax, %esi
     86 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
     87 ; AVX512F-NEXT:    shrl $16, %eax
     88 ; AVX512F-NEXT:    shrq $32, %rcx
     89 ; AVX512F-NEXT:    shrq $48, %rdx
     90 ; AVX512F-NEXT:    movswl %dx, %edx
     91 ; AVX512F-NEXT:    vmovd %edx, %xmm0
     92 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
     93 ; AVX512F-NEXT:    movswl %cx, %ecx
     94 ; AVX512F-NEXT:    vmovd %ecx, %xmm1
     95 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
     96 ; AVX512F-NEXT:    cwtl
     97 ; AVX512F-NEXT:    vmovd %eax, %xmm2
     98 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
     99 ; AVX512F-NEXT:    vmovd %esi, %xmm3
    100 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
    101 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    102 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    103 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    104 ; AVX512F-NEXT:    retq
    105 ;
    106 ; AVX512VL-LABEL: cvt_4i16_to_4f32:
    107 ; AVX512VL:       # %bb.0:
    108 ; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
    109 ; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
    110 ; AVX512VL-NEXT:    movq %rax, %rcx
    111 ; AVX512VL-NEXT:    movq %rax, %rdx
    112 ; AVX512VL-NEXT:    movswl %ax, %esi
    113 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    114 ; AVX512VL-NEXT:    shrl $16, %eax
    115 ; AVX512VL-NEXT:    shrq $32, %rcx
    116 ; AVX512VL-NEXT:    shrq $48, %rdx
    117 ; AVX512VL-NEXT:    movswl %dx, %edx
    118 ; AVX512VL-NEXT:    vmovd %edx, %xmm0
    119 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
    120 ; AVX512VL-NEXT:    movswl %cx, %ecx
    121 ; AVX512VL-NEXT:    vmovd %ecx, %xmm1
    122 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
    123 ; AVX512VL-NEXT:    cwtl
    124 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
    125 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
    126 ; AVX512VL-NEXT:    vmovd %esi, %xmm3
    127 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
    128 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    129 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    130 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    131 ; AVX512VL-NEXT:    retq
    132   %1 = bitcast <4 x i16> %a0 to <4 x half>
    133   %2 = fpext <4 x half> %1 to <4 x float>
    134   ret <4 x float> %2
    135 }
    136 
; Only the low four lanes of the <8 x i16> input are converted (the
; shufflevector selects elements 0..3). AVX1/AVX2/AVX512F read the low
; quadword directly with vmovq; AVX512VL round-trips through a stack slot
; via vpmovzxwd + vpmovdw before the same scalar vcvtph2ps sequence.
    137 define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
    138 ; AVX1-LABEL: cvt_8i16_to_4f32:
    139 ; AVX1:       # %bb.0:
    140 ; AVX1-NEXT:    vmovq %xmm0, %rax
    141 ; AVX1-NEXT:    movq %rax, %rcx
    142 ; AVX1-NEXT:    movq %rax, %rdx
    143 ; AVX1-NEXT:    movswl %ax, %esi
    144 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
    145 ; AVX1-NEXT:    shrl $16, %eax
    146 ; AVX1-NEXT:    shrq $32, %rcx
    147 ; AVX1-NEXT:    shrq $48, %rdx
    148 ; AVX1-NEXT:    movswl %dx, %edx
    149 ; AVX1-NEXT:    vmovd %edx, %xmm0
    150 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    151 ; AVX1-NEXT:    movswl %cx, %ecx
    152 ; AVX1-NEXT:    vmovd %ecx, %xmm1
    153 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    154 ; AVX1-NEXT:    cwtl
    155 ; AVX1-NEXT:    vmovd %eax, %xmm2
    156 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    157 ; AVX1-NEXT:    vmovd %esi, %xmm3
    158 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    159 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    160 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    161 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    162 ; AVX1-NEXT:    retq
    163 ;
    164 ; AVX2-LABEL: cvt_8i16_to_4f32:
    165 ; AVX2:       # %bb.0:
    166 ; AVX2-NEXT:    vmovq %xmm0, %rax
    167 ; AVX2-NEXT:    movq %rax, %rcx
    168 ; AVX2-NEXT:    movq %rax, %rdx
    169 ; AVX2-NEXT:    movswl %ax, %esi
    170 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
    171 ; AVX2-NEXT:    shrl $16, %eax
    172 ; AVX2-NEXT:    shrq $32, %rcx
    173 ; AVX2-NEXT:    shrq $48, %rdx
    174 ; AVX2-NEXT:    movswl %dx, %edx
    175 ; AVX2-NEXT:    vmovd %edx, %xmm0
    176 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    177 ; AVX2-NEXT:    movswl %cx, %ecx
    178 ; AVX2-NEXT:    vmovd %ecx, %xmm1
    179 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    180 ; AVX2-NEXT:    cwtl
    181 ; AVX2-NEXT:    vmovd %eax, %xmm2
    182 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    183 ; AVX2-NEXT:    vmovd %esi, %xmm3
    184 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    185 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    186 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    187 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    188 ; AVX2-NEXT:    retq
    189 ;
    190 ; AVX512F-LABEL: cvt_8i16_to_4f32:
    191 ; AVX512F:       # %bb.0:
    192 ; AVX512F-NEXT:    vmovq %xmm0, %rax
    193 ; AVX512F-NEXT:    movq %rax, %rcx
    194 ; AVX512F-NEXT:    movq %rax, %rdx
    195 ; AVX512F-NEXT:    movswl %ax, %esi
    196 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
    197 ; AVX512F-NEXT:    shrl $16, %eax
    198 ; AVX512F-NEXT:    shrq $32, %rcx
    199 ; AVX512F-NEXT:    shrq $48, %rdx
    200 ; AVX512F-NEXT:    movswl %dx, %edx
    201 ; AVX512F-NEXT:    vmovd %edx, %xmm0
    202 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
    203 ; AVX512F-NEXT:    movswl %cx, %ecx
    204 ; AVX512F-NEXT:    vmovd %ecx, %xmm1
    205 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
    206 ; AVX512F-NEXT:    cwtl
    207 ; AVX512F-NEXT:    vmovd %eax, %xmm2
    208 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
    209 ; AVX512F-NEXT:    vmovd %esi, %xmm3
    210 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
    211 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    212 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    213 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    214 ; AVX512F-NEXT:    retq
    215 ;
    216 ; AVX512VL-LABEL: cvt_8i16_to_4f32:
    217 ; AVX512VL:       # %bb.0:
    218 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    219 ; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
    220 ; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
    221 ; AVX512VL-NEXT:    movq %rax, %rcx
    222 ; AVX512VL-NEXT:    movq %rax, %rdx
    223 ; AVX512VL-NEXT:    movswl %ax, %esi
    224 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    225 ; AVX512VL-NEXT:    shrl $16, %eax
    226 ; AVX512VL-NEXT:    shrq $32, %rcx
    227 ; AVX512VL-NEXT:    shrq $48, %rdx
    228 ; AVX512VL-NEXT:    movswl %dx, %edx
    229 ; AVX512VL-NEXT:    vmovd %edx, %xmm0
    230 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
    231 ; AVX512VL-NEXT:    movswl %cx, %ecx
    232 ; AVX512VL-NEXT:    vmovd %ecx, %xmm1
    233 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
    234 ; AVX512VL-NEXT:    cwtl
    235 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
    236 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
    237 ; AVX512VL-NEXT:    vmovd %esi, %xmm3
    238 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
    239 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    240 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    241 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    242 ; AVX512VL-NEXT:    retq
    243   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    244   %2 = bitcast <4 x i16> %1 to <4 x half>
    245   %3 = fpext <4 x half> %2 to <4 x float>
    246   ret <4 x float> %3
    247 }
    248 
; <8 x i16> -> <8 x float>; a single shared check sequence (prefix ALL) for
; every run line. Both quadwords are extracted (vpextrq $1 / vmovq), each
; 16-bit lane is isolated with shifts + movswl, converted with vcvtph2ps,
; and the eight scalars recombined with vinsertps + vinsertf128.
    249 define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
    250 ; ALL-LABEL: cvt_8i16_to_8f32:
    251 ; ALL:       # %bb.0:
    252 ; ALL-NEXT:    vpextrq $1, %xmm0, %rdx
    253 ; ALL-NEXT:    movq %rdx, %r8
    254 ; ALL-NEXT:    movq %rdx, %r10
    255 ; ALL-NEXT:    movswl %dx, %r9d
    256 ; ALL-NEXT:    # kill: def $edx killed $edx killed $rdx
    257 ; ALL-NEXT:    shrl $16, %edx
    258 ; ALL-NEXT:    shrq $32, %r8
    259 ; ALL-NEXT:    shrq $48, %r10
    260 ; ALL-NEXT:    vmovq %xmm0, %rdi
    261 ; ALL-NEXT:    movq %rdi, %rax
    262 ; ALL-NEXT:    movq %rdi, %rsi
    263 ; ALL-NEXT:    movswl %di, %ecx
    264 ; ALL-NEXT:    # kill: def $edi killed $edi killed $rdi
    265 ; ALL-NEXT:    shrl $16, %edi
    266 ; ALL-NEXT:    shrq $32, %rax
    267 ; ALL-NEXT:    shrq $48, %rsi
    268 ; ALL-NEXT:    movswl %si, %esi
    269 ; ALL-NEXT:    vmovd %esi, %xmm0
    270 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    271 ; ALL-NEXT:    cwtl
    272 ; ALL-NEXT:    vmovd %eax, %xmm1
    273 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    274 ; ALL-NEXT:    movswl %di, %eax
    275 ; ALL-NEXT:    vmovd %eax, %xmm2
    276 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    277 ; ALL-NEXT:    vmovd %ecx, %xmm3
    278 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    279 ; ALL-NEXT:    movswl %r10w, %eax
    280 ; ALL-NEXT:    vmovd %eax, %xmm4
    281 ; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
    282 ; ALL-NEXT:    movswl %r8w, %eax
    283 ; ALL-NEXT:    vmovd %eax, %xmm5
    284 ; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
    285 ; ALL-NEXT:    movswl %dx, %eax
    286 ; ALL-NEXT:    vmovd %eax, %xmm6
    287 ; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
    288 ; ALL-NEXT:    vmovd %r9d, %xmm7
    289 ; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
    290 ; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
    291 ; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    292 ; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    293 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    294 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    295 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    296 ; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    297 ; ALL-NEXT:    retq
    298   %1 = bitcast <8 x i16> %a0 to <8 x half>
    299   %2 = fpext <8 x half> %1 to <8 x float>
    300   ret <8 x float> %2
    301 }
    302 
; <16 x i16> -> <16 x float>. Fully scalarized on every target: both 64-bit
; halves of each 128-bit lane are extracted (vmovq / vpextrq $1), each
; 16-bit field is isolated via shifts + movswl/cwtl, converted with
; vcvtph2ps, then the sixteen scalars are reassembled with vinsertps and
; vinsertf128 (plus vinsertf64x4 on AVX512 for the 512-bit result).
; AVX512VL additionally spills into EVEX-only registers xmm16-xmm22.
    303 define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
    304 ; AVX1-LABEL: cvt_16i16_to_16f32:
    305 ; AVX1:       # %bb.0:
    306 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    307 ; AVX1-NEXT:    vmovq %xmm4, %rax
    308 ; AVX1-NEXT:    movq %rax, %rcx
    309 ; AVX1-NEXT:    shrq $48, %rcx
    310 ; AVX1-NEXT:    movswl %cx, %ecx
    311 ; AVX1-NEXT:    vmovd %ecx, %xmm8
    312 ; AVX1-NEXT:    movq %rax, %rcx
    313 ; AVX1-NEXT:    shrq $32, %rcx
    314 ; AVX1-NEXT:    movswl %cx, %ecx
    315 ; AVX1-NEXT:    vmovd %ecx, %xmm9
    316 ; AVX1-NEXT:    movswl %ax, %ecx
    317 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
    318 ; AVX1-NEXT:    shrl $16, %eax
    319 ; AVX1-NEXT:    cwtl
    320 ; AVX1-NEXT:    vmovd %eax, %xmm10
    321 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
    322 ; AVX1-NEXT:    vmovd %ecx, %xmm11
    323 ; AVX1-NEXT:    movq %rax, %rcx
    324 ; AVX1-NEXT:    shrq $48, %rcx
    325 ; AVX1-NEXT:    movswl %cx, %ecx
    326 ; AVX1-NEXT:    vmovd %ecx, %xmm12
    327 ; AVX1-NEXT:    movq %rax, %rcx
    328 ; AVX1-NEXT:    shrq $32, %rcx
    329 ; AVX1-NEXT:    movswl %cx, %ecx
    330 ; AVX1-NEXT:    vmovd %ecx, %xmm13
    331 ; AVX1-NEXT:    movswl %ax, %ecx
    332 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
    333 ; AVX1-NEXT:    shrl $16, %eax
    334 ; AVX1-NEXT:    cwtl
    335 ; AVX1-NEXT:    vmovd %eax, %xmm14
    336 ; AVX1-NEXT:    vmovq %xmm0, %rax
    337 ; AVX1-NEXT:    vmovd %ecx, %xmm15
    338 ; AVX1-NEXT:    movq %rax, %rcx
    339 ; AVX1-NEXT:    shrq $48, %rcx
    340 ; AVX1-NEXT:    movswl %cx, %ecx
    341 ; AVX1-NEXT:    vmovd %ecx, %xmm2
    342 ; AVX1-NEXT:    movq %rax, %rcx
    343 ; AVX1-NEXT:    shrq $32, %rcx
    344 ; AVX1-NEXT:    movswl %cx, %ecx
    345 ; AVX1-NEXT:    vmovd %ecx, %xmm3
    346 ; AVX1-NEXT:    movswl %ax, %ecx
    347 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
    348 ; AVX1-NEXT:    shrl $16, %eax
    349 ; AVX1-NEXT:    cwtl
    350 ; AVX1-NEXT:    vmovd %eax, %xmm4
    351 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
    352 ; AVX1-NEXT:    vmovd %ecx, %xmm0
    353 ; AVX1-NEXT:    movq %rax, %rcx
    354 ; AVX1-NEXT:    shrq $48, %rcx
    355 ; AVX1-NEXT:    movswl %cx, %ecx
    356 ; AVX1-NEXT:    vmovd %ecx, %xmm5
    357 ; AVX1-NEXT:    movq %rax, %rcx
    358 ; AVX1-NEXT:    shrq $32, %rcx
    359 ; AVX1-NEXT:    movswl %cx, %ecx
    360 ; AVX1-NEXT:    vmovd %ecx, %xmm6
    361 ; AVX1-NEXT:    movl %eax, %ecx
    362 ; AVX1-NEXT:    shrl $16, %ecx
    363 ; AVX1-NEXT:    movswl %cx, %ecx
    364 ; AVX1-NEXT:    vmovd %ecx, %xmm7
    365 ; AVX1-NEXT:    cwtl
    366 ; AVX1-NEXT:    vmovd %eax, %xmm1
    367 ; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
    368 ; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
    369 ; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
    370 ; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
    371 ; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
    372 ; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
    373 ; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
    374 ; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
    375 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    376 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    377 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    378 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    379 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    380 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    381 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    382 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    383 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
    384 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    385 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    386 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
    387 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
    388 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
    389 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    390 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
    391 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    392 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    393 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
    394 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    395 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    396 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    397 ; AVX1-NEXT:    retq
    398 ;
    399 ; AVX2-LABEL: cvt_16i16_to_16f32:
    400 ; AVX2:       # %bb.0:
    401 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
    402 ; AVX2-NEXT:    vmovq %xmm4, %rax
    403 ; AVX2-NEXT:    movq %rax, %rcx
    404 ; AVX2-NEXT:    shrq $48, %rcx
    405 ; AVX2-NEXT:    movswl %cx, %ecx
    406 ; AVX2-NEXT:    vmovd %ecx, %xmm8
    407 ; AVX2-NEXT:    movq %rax, %rcx
    408 ; AVX2-NEXT:    shrq $32, %rcx
    409 ; AVX2-NEXT:    movswl %cx, %ecx
    410 ; AVX2-NEXT:    vmovd %ecx, %xmm9
    411 ; AVX2-NEXT:    movswl %ax, %ecx
    412 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
    413 ; AVX2-NEXT:    shrl $16, %eax
    414 ; AVX2-NEXT:    cwtl
    415 ; AVX2-NEXT:    vmovd %eax, %xmm10
    416 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
    417 ; AVX2-NEXT:    vmovd %ecx, %xmm11
    418 ; AVX2-NEXT:    movq %rax, %rcx
    419 ; AVX2-NEXT:    shrq $48, %rcx
    420 ; AVX2-NEXT:    movswl %cx, %ecx
    421 ; AVX2-NEXT:    vmovd %ecx, %xmm12
    422 ; AVX2-NEXT:    movq %rax, %rcx
    423 ; AVX2-NEXT:    shrq $32, %rcx
    424 ; AVX2-NEXT:    movswl %cx, %ecx
    425 ; AVX2-NEXT:    vmovd %ecx, %xmm13
    426 ; AVX2-NEXT:    movswl %ax, %ecx
    427 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
    428 ; AVX2-NEXT:    shrl $16, %eax
    429 ; AVX2-NEXT:    cwtl
    430 ; AVX2-NEXT:    vmovd %eax, %xmm14
    431 ; AVX2-NEXT:    vmovq %xmm0, %rax
    432 ; AVX2-NEXT:    vmovd %ecx, %xmm15
    433 ; AVX2-NEXT:    movq %rax, %rcx
    434 ; AVX2-NEXT:    shrq $48, %rcx
    435 ; AVX2-NEXT:    movswl %cx, %ecx
    436 ; AVX2-NEXT:    vmovd %ecx, %xmm2
    437 ; AVX2-NEXT:    movq %rax, %rcx
    438 ; AVX2-NEXT:    shrq $32, %rcx
    439 ; AVX2-NEXT:    movswl %cx, %ecx
    440 ; AVX2-NEXT:    vmovd %ecx, %xmm3
    441 ; AVX2-NEXT:    movswl %ax, %ecx
    442 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
    443 ; AVX2-NEXT:    shrl $16, %eax
    444 ; AVX2-NEXT:    cwtl
    445 ; AVX2-NEXT:    vmovd %eax, %xmm4
    446 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
    447 ; AVX2-NEXT:    vmovd %ecx, %xmm0
    448 ; AVX2-NEXT:    movq %rax, %rcx
    449 ; AVX2-NEXT:    shrq $48, %rcx
    450 ; AVX2-NEXT:    movswl %cx, %ecx
    451 ; AVX2-NEXT:    vmovd %ecx, %xmm5
    452 ; AVX2-NEXT:    movq %rax, %rcx
    453 ; AVX2-NEXT:    shrq $32, %rcx
    454 ; AVX2-NEXT:    movswl %cx, %ecx
    455 ; AVX2-NEXT:    vmovd %ecx, %xmm6
    456 ; AVX2-NEXT:    movl %eax, %ecx
    457 ; AVX2-NEXT:    shrl $16, %ecx
    458 ; AVX2-NEXT:    movswl %cx, %ecx
    459 ; AVX2-NEXT:    vmovd %ecx, %xmm7
    460 ; AVX2-NEXT:    cwtl
    461 ; AVX2-NEXT:    vmovd %eax, %xmm1
    462 ; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
    463 ; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
    464 ; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
    465 ; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
    466 ; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
    467 ; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
    468 ; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
    469 ; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
    470 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    471 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    472 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    473 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    474 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    475 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    476 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
    477 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    478 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
    479 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    480 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    481 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
    482 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
    483 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
    484 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    485 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
    486 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    487 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    488 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
    489 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    490 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    491 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    492 ; AVX2-NEXT:    retq
    493 ;
    494 ; AVX512F-LABEL: cvt_16i16_to_16f32:
    495 ; AVX512F:       # %bb.0:
    496 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm10
    497 ; AVX512F-NEXT:    vmovq %xmm0, %rax
    498 ; AVX512F-NEXT:    movq %rax, %rcx
    499 ; AVX512F-NEXT:    shrq $48, %rcx
    500 ; AVX512F-NEXT:    movswl %cx, %ecx
    501 ; AVX512F-NEXT:    vmovd %ecx, %xmm8
    502 ; AVX512F-NEXT:    movq %rax, %rcx
    503 ; AVX512F-NEXT:    shrq $32, %rcx
    504 ; AVX512F-NEXT:    movswl %cx, %ecx
    505 ; AVX512F-NEXT:    vmovd %ecx, %xmm9
    506 ; AVX512F-NEXT:    movswl %ax, %ecx
    507 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
    508 ; AVX512F-NEXT:    shrl $16, %eax
    509 ; AVX512F-NEXT:    cwtl
    510 ; AVX512F-NEXT:    vmovd %eax, %xmm11
    511 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
    512 ; AVX512F-NEXT:    vmovd %ecx, %xmm12
    513 ; AVX512F-NEXT:    movq %rax, %rcx
    514 ; AVX512F-NEXT:    shrq $48, %rcx
    515 ; AVX512F-NEXT:    movswl %cx, %ecx
    516 ; AVX512F-NEXT:    vmovd %ecx, %xmm13
    517 ; AVX512F-NEXT:    movq %rax, %rcx
    518 ; AVX512F-NEXT:    shrq $32, %rcx
    519 ; AVX512F-NEXT:    movswl %cx, %ecx
    520 ; AVX512F-NEXT:    vmovd %ecx, %xmm14
    521 ; AVX512F-NEXT:    movswl %ax, %ecx
    522 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
    523 ; AVX512F-NEXT:    shrl $16, %eax
    524 ; AVX512F-NEXT:    cwtl
    525 ; AVX512F-NEXT:    vmovd %eax, %xmm15
    526 ; AVX512F-NEXT:    vmovq %xmm10, %rax
    527 ; AVX512F-NEXT:    vmovd %ecx, %xmm2
    528 ; AVX512F-NEXT:    movq %rax, %rcx
    529 ; AVX512F-NEXT:    shrq $48, %rcx
    530 ; AVX512F-NEXT:    movswl %cx, %ecx
    531 ; AVX512F-NEXT:    vmovd %ecx, %xmm3
    532 ; AVX512F-NEXT:    movq %rax, %rcx
    533 ; AVX512F-NEXT:    shrq $32, %rcx
    534 ; AVX512F-NEXT:    movswl %cx, %ecx
    535 ; AVX512F-NEXT:    vmovd %ecx, %xmm1
    536 ; AVX512F-NEXT:    movswl %ax, %ecx
    537 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
    538 ; AVX512F-NEXT:    shrl $16, %eax
    539 ; AVX512F-NEXT:    cwtl
    540 ; AVX512F-NEXT:    vmovd %eax, %xmm4
    541 ; AVX512F-NEXT:    vpextrq $1, %xmm10, %rax
    542 ; AVX512F-NEXT:    vmovd %ecx, %xmm10
    543 ; AVX512F-NEXT:    movq %rax, %rcx
    544 ; AVX512F-NEXT:    shrq $48, %rcx
    545 ; AVX512F-NEXT:    movswl %cx, %ecx
    546 ; AVX512F-NEXT:    vmovd %ecx, %xmm5
    547 ; AVX512F-NEXT:    movq %rax, %rcx
    548 ; AVX512F-NEXT:    shrq $32, %rcx
    549 ; AVX512F-NEXT:    movswl %cx, %ecx
    550 ; AVX512F-NEXT:    vmovd %ecx, %xmm6
    551 ; AVX512F-NEXT:    movl %eax, %ecx
    552 ; AVX512F-NEXT:    shrl $16, %ecx
    553 ; AVX512F-NEXT:    movswl %cx, %ecx
    554 ; AVX512F-NEXT:    vmovd %ecx, %xmm7
    555 ; AVX512F-NEXT:    cwtl
    556 ; AVX512F-NEXT:    vmovd %eax, %xmm0
    557 ; AVX512F-NEXT:    vcvtph2ps %xmm8, %xmm8
    558 ; AVX512F-NEXT:    vcvtph2ps %xmm9, %xmm9
    559 ; AVX512F-NEXT:    vcvtph2ps %xmm11, %xmm11
    560 ; AVX512F-NEXT:    vcvtph2ps %xmm12, %xmm12
    561 ; AVX512F-NEXT:    vcvtph2ps %xmm13, %xmm13
    562 ; AVX512F-NEXT:    vcvtph2ps %xmm14, %xmm14
    563 ; AVX512F-NEXT:    vcvtph2ps %xmm15, %xmm15
    564 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
    565 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
    566 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
    567 ; AVX512F-NEXT:    vcvtph2ps %xmm4, %xmm4
    568 ; AVX512F-NEXT:    vcvtph2ps %xmm10, %xmm10
    569 ; AVX512F-NEXT:    vcvtph2ps %xmm5, %xmm5
    570 ; AVX512F-NEXT:    vcvtph2ps %xmm6, %xmm6
    571 ; AVX512F-NEXT:    vcvtph2ps %xmm7, %xmm7
    572 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
    573 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
    574 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
    575 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
    576 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
    577 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
    578 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
    579 ; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
    580 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
    581 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
    582 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
    583 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
    584 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    585 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    586 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    587 ; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
    588 ; AVX512F-NEXT:    retq
    589 ;
    590 ; AVX512VL-LABEL: cvt_16i16_to_16f32:
    591 ; AVX512VL:       # %bb.0:
    592 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm10
    593 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
    594 ; AVX512VL-NEXT:    movq %rax, %rcx
    595 ; AVX512VL-NEXT:    shrq $48, %rcx
    596 ; AVX512VL-NEXT:    movswl %cx, %ecx
    597 ; AVX512VL-NEXT:    vmovd %ecx, %xmm8
    598 ; AVX512VL-NEXT:    movq %rax, %rcx
    599 ; AVX512VL-NEXT:    shrq $32, %rcx
    600 ; AVX512VL-NEXT:    movswl %cx, %ecx
    601 ; AVX512VL-NEXT:    vmovd %ecx, %xmm9
    602 ; AVX512VL-NEXT:    movswl %ax, %ecx
    603 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    604 ; AVX512VL-NEXT:    shrl $16, %eax
    605 ; AVX512VL-NEXT:    cwtl
    606 ; AVX512VL-NEXT:    vmovd %eax, %xmm11
    607 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
    608 ; AVX512VL-NEXT:    vmovd %ecx, %xmm12
    609 ; AVX512VL-NEXT:    movq %rax, %rcx
    610 ; AVX512VL-NEXT:    shrq $48, %rcx
    611 ; AVX512VL-NEXT:    movswl %cx, %ecx
    612 ; AVX512VL-NEXT:    vmovd %ecx, %xmm13
    613 ; AVX512VL-NEXT:    movq %rax, %rcx
    614 ; AVX512VL-NEXT:    shrq $32, %rcx
    615 ; AVX512VL-NEXT:    movswl %cx, %ecx
    616 ; AVX512VL-NEXT:    vmovd %ecx, %xmm14
    617 ; AVX512VL-NEXT:    movswl %ax, %ecx
    618 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    619 ; AVX512VL-NEXT:    shrl $16, %eax
    620 ; AVX512VL-NEXT:    cwtl
    621 ; AVX512VL-NEXT:    vmovd %eax, %xmm15
    622 ; AVX512VL-NEXT:    vmovq %xmm10, %rax
    623 ; AVX512VL-NEXT:    vmovd %ecx, %xmm16
    624 ; AVX512VL-NEXT:    movq %rax, %rcx
    625 ; AVX512VL-NEXT:    shrq $48, %rcx
    626 ; AVX512VL-NEXT:    movswl %cx, %ecx
    627 ; AVX512VL-NEXT:    vmovd %ecx, %xmm17
    628 ; AVX512VL-NEXT:    movq %rax, %rcx
    629 ; AVX512VL-NEXT:    shrq $32, %rcx
    630 ; AVX512VL-NEXT:    movswl %cx, %ecx
    631 ; AVX512VL-NEXT:    vmovd %ecx, %xmm18
    632 ; AVX512VL-NEXT:    movswl %ax, %ecx
    633 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    634 ; AVX512VL-NEXT:    shrl $16, %eax
    635 ; AVX512VL-NEXT:    cwtl
    636 ; AVX512VL-NEXT:    vmovd %eax, %xmm19
    637 ; AVX512VL-NEXT:    vpextrq $1, %xmm10, %rax
    638 ; AVX512VL-NEXT:    vmovd %ecx, %xmm10
    639 ; AVX512VL-NEXT:    movq %rax, %rcx
    640 ; AVX512VL-NEXT:    shrq $48, %rcx
    641 ; AVX512VL-NEXT:    movswl %cx, %ecx
    642 ; AVX512VL-NEXT:    vmovd %ecx, %xmm20
    643 ; AVX512VL-NEXT:    movq %rax, %rcx
    644 ; AVX512VL-NEXT:    shrq $32, %rcx
    645 ; AVX512VL-NEXT:    movswl %cx, %ecx
    646 ; AVX512VL-NEXT:    vmovd %ecx, %xmm21
    647 ; AVX512VL-NEXT:    movl %eax, %ecx
    648 ; AVX512VL-NEXT:    shrl $16, %ecx
    649 ; AVX512VL-NEXT:    movswl %cx, %ecx
    650 ; AVX512VL-NEXT:    vmovd %ecx, %xmm22
    651 ; AVX512VL-NEXT:    cwtl
    652 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
    653 ; AVX512VL-NEXT:    vcvtph2ps %xmm8, %xmm8
    654 ; AVX512VL-NEXT:    vcvtph2ps %xmm9, %xmm9
    655 ; AVX512VL-NEXT:    vcvtph2ps %xmm11, %xmm11
    656 ; AVX512VL-NEXT:    vcvtph2ps %xmm12, %xmm12
    657 ; AVX512VL-NEXT:    vcvtph2ps %xmm13, %xmm13
    658 ; AVX512VL-NEXT:    vcvtph2ps %xmm14, %xmm14
    659 ; AVX512VL-NEXT:    vcvtph2ps %xmm15, %xmm15
    660 ; AVX512VL-NEXT:    vcvtph2ps %xmm16, %xmm16
    661 ; AVX512VL-NEXT:    vcvtph2ps %xmm17, %xmm4
    662 ; AVX512VL-NEXT:    vcvtph2ps %xmm18, %xmm0
    663 ; AVX512VL-NEXT:    vcvtph2ps %xmm19, %xmm5
    664 ; AVX512VL-NEXT:    vcvtph2ps %xmm10, %xmm7
    665 ; AVX512VL-NEXT:    vcvtph2ps %xmm20, %xmm3
    666 ; AVX512VL-NEXT:    vcvtph2ps %xmm21, %xmm6
    667 ; AVX512VL-NEXT:    vcvtph2ps %xmm22, %xmm1
    668 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
    669 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
    670 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    671 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
    672 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
    673 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
    674 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
    675 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    676 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
    677 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
    678 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
    679 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
    680 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    681 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    682 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    683 ; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
    684 ; AVX512VL-NEXT:    retq
    685   %1 = bitcast <16 x i16> %a0 to <16 x half>
    686   %2 = fpext <16 x half> %1 to <16 x float>
    687   ret <16 x float> %2
    688 }
    689 
    690 ;
    691 ; Half to Float (Load)
    692 ;
    693 
; Loads one i16 from memory, reinterprets its bits as an IEEE half, and widens
; it to float. All run lines share one lowering: sign-extending load into a GPR,
; vmovd into an XMM register, then F16C vcvtph2ps. CHECK lines are autogenerated
; by update_llc_test_checks.py -- regenerate rather than hand-edit them.
    694 define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
    695 ; ALL-LABEL: load_cvt_i16_to_f32:
    696 ; ALL:       # %bb.0:
    697 ; ALL-NEXT:    movswl (%rdi), %eax
    698 ; ALL-NEXT:    vmovd %eax, %xmm0
    699 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    700 ; ALL-NEXT:    retq
    701   %1 = load i16, i16* %a0
    702   %2 = bitcast i16 %1 to half
    703   %3 = fpext half %2 to float
    704   ret float %3
    705 }
    706 
; Loads <4 x i16> from memory and extends it element-wise to <4 x float> via
; half. Every run configuration scalarizes: four movswl loads at offsets
; 0/2/4/6, four vcvtph2ps conversions, then three vinsertps to rebuild the
; vector. CHECK lines are autogenerated by update_llc_test_checks.py.
    707 define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
    708 ; ALL-LABEL: load_cvt_4i16_to_4f32:
    709 ; ALL:       # %bb.0:
    710 ; ALL-NEXT:    movswl 6(%rdi), %eax
    711 ; ALL-NEXT:    vmovd %eax, %xmm0
    712 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    713 ; ALL-NEXT:    movswl 4(%rdi), %eax
    714 ; ALL-NEXT:    vmovd %eax, %xmm1
    715 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    716 ; ALL-NEXT:    movswl (%rdi), %eax
    717 ; ALL-NEXT:    vmovd %eax, %xmm2
    718 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    719 ; ALL-NEXT:    movswl 2(%rdi), %eax
    720 ; ALL-NEXT:    vmovd %eax, %xmm3
    721 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    722 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    723 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    724 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    725 ; ALL-NEXT:    retq
    726   %1 = load <4 x i16>, <4 x i16>* %a0
    727   %2 = bitcast <4 x i16> %1 to <4 x half>
    728   %3 = fpext <4 x half> %2 to <4 x float>
    729   ret <4 x float> %3
    730 }
    731 
; Loads <8 x i16>, keeps only the low four lanes (shufflevector), and extends
; them to <4 x float> via half. AVX1/AVX2/AVX512F load the low 8 bytes as one
; 64-bit GPR and carve out the four i16 lanes with shifts; AVX512VL instead
; round-trips through a stack slot via vpmovzxwd/vpmovdw before the same GPR
; scalarization. CHECK lines are autogenerated by update_llc_test_checks.py.
    732 define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
    733 ; AVX1-LABEL: load_cvt_8i16_to_4f32:
    734 ; AVX1:       # %bb.0:
    735 ; AVX1-NEXT:    movq (%rdi), %rax
    736 ; AVX1-NEXT:    movq %rax, %rcx
    737 ; AVX1-NEXT:    movq %rax, %rdx
    738 ; AVX1-NEXT:    movswl %ax, %esi
    739 ; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
    740 ; AVX1-NEXT:    shrl $16, %eax
    741 ; AVX1-NEXT:    shrq $32, %rcx
    742 ; AVX1-NEXT:    shrq $48, %rdx
    743 ; AVX1-NEXT:    movswl %dx, %edx
    744 ; AVX1-NEXT:    vmovd %edx, %xmm0
    745 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    746 ; AVX1-NEXT:    movswl %cx, %ecx
    747 ; AVX1-NEXT:    vmovd %ecx, %xmm1
    748 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    749 ; AVX1-NEXT:    cwtl
    750 ; AVX1-NEXT:    vmovd %eax, %xmm2
    751 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    752 ; AVX1-NEXT:    vmovd %esi, %xmm3
    753 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    754 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    755 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    756 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    757 ; AVX1-NEXT:    retq
    758 ;
    759 ; AVX2-LABEL: load_cvt_8i16_to_4f32:
    760 ; AVX2:       # %bb.0:
    761 ; AVX2-NEXT:    movq (%rdi), %rax
    762 ; AVX2-NEXT:    movq %rax, %rcx
    763 ; AVX2-NEXT:    movq %rax, %rdx
    764 ; AVX2-NEXT:    movswl %ax, %esi
    765 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
    766 ; AVX2-NEXT:    shrl $16, %eax
    767 ; AVX2-NEXT:    shrq $32, %rcx
    768 ; AVX2-NEXT:    shrq $48, %rdx
    769 ; AVX2-NEXT:    movswl %dx, %edx
    770 ; AVX2-NEXT:    vmovd %edx, %xmm0
    771 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    772 ; AVX2-NEXT:    movswl %cx, %ecx
    773 ; AVX2-NEXT:    vmovd %ecx, %xmm1
    774 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    775 ; AVX2-NEXT:    cwtl
    776 ; AVX2-NEXT:    vmovd %eax, %xmm2
    777 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    778 ; AVX2-NEXT:    vmovd %esi, %xmm3
    779 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    780 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    781 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    782 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    783 ; AVX2-NEXT:    retq
    784 ;
    785 ; AVX512F-LABEL: load_cvt_8i16_to_4f32:
    786 ; AVX512F:       # %bb.0:
    787 ; AVX512F-NEXT:    movq (%rdi), %rax
    788 ; AVX512F-NEXT:    movq %rax, %rcx
    789 ; AVX512F-NEXT:    movq %rax, %rdx
    790 ; AVX512F-NEXT:    movswl %ax, %esi
    791 ; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
    792 ; AVX512F-NEXT:    shrl $16, %eax
    793 ; AVX512F-NEXT:    shrq $32, %rcx
    794 ; AVX512F-NEXT:    shrq $48, %rdx
    795 ; AVX512F-NEXT:    movswl %dx, %edx
    796 ; AVX512F-NEXT:    vmovd %edx, %xmm0
    797 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
    798 ; AVX512F-NEXT:    movswl %cx, %ecx
    799 ; AVX512F-NEXT:    vmovd %ecx, %xmm1
    800 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
    801 ; AVX512F-NEXT:    cwtl
    802 ; AVX512F-NEXT:    vmovd %eax, %xmm2
    803 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
    804 ; AVX512F-NEXT:    vmovd %esi, %xmm3
    805 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
    806 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    807 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    808 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    809 ; AVX512F-NEXT:    retq
    810 ;
    811 ; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
    812 ; AVX512VL:       # %bb.0:
    813 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
    814 ; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
    815 ; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
    816 ; AVX512VL-NEXT:    movq %rax, %rcx
    817 ; AVX512VL-NEXT:    movq %rax, %rdx
    818 ; AVX512VL-NEXT:    movswl %ax, %esi
    819 ; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
    820 ; AVX512VL-NEXT:    shrl $16, %eax
    821 ; AVX512VL-NEXT:    shrq $32, %rcx
    822 ; AVX512VL-NEXT:    shrq $48, %rdx
    823 ; AVX512VL-NEXT:    movswl %dx, %edx
    824 ; AVX512VL-NEXT:    vmovd %edx, %xmm0
    825 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
    826 ; AVX512VL-NEXT:    movswl %cx, %ecx
    827 ; AVX512VL-NEXT:    vmovd %ecx, %xmm1
    828 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
    829 ; AVX512VL-NEXT:    cwtl
    830 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
    831 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
    832 ; AVX512VL-NEXT:    vmovd %esi, %xmm3
    833 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
    834 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    835 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    836 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    837 ; AVX512VL-NEXT:    retq
    838   %1 = load <8 x i16>, <8 x i16>* %a0
    839   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    840   %3 = bitcast <4 x i16> %2 to <4 x half>
    841   %4 = fpext <4 x half> %3 to <4 x float>
    842   ret <4 x float> %4
    843 }
    844 
; Loads <8 x i16> and extends it to <8 x float> via half. All run lines share
; one lowering: eight movswl loads (offsets 0..14), eight vcvtph2ps, vinsertps
; to assemble each 4-lane half, and vinsertf128 to join them into a YMM.
; CHECK lines are autogenerated by update_llc_test_checks.py.
    845 define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
    846 ; ALL-LABEL: load_cvt_8i16_to_8f32:
    847 ; ALL:       # %bb.0:
    848 ; ALL-NEXT:    movswl 6(%rdi), %eax
    849 ; ALL-NEXT:    vmovd %eax, %xmm0
    850 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    851 ; ALL-NEXT:    movswl 4(%rdi), %eax
    852 ; ALL-NEXT:    vmovd %eax, %xmm1
    853 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    854 ; ALL-NEXT:    movswl (%rdi), %eax
    855 ; ALL-NEXT:    vmovd %eax, %xmm2
    856 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    857 ; ALL-NEXT:    movswl 2(%rdi), %eax
    858 ; ALL-NEXT:    vmovd %eax, %xmm3
    859 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    860 ; ALL-NEXT:    movswl 14(%rdi), %eax
    861 ; ALL-NEXT:    vmovd %eax, %xmm4
    862 ; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
    863 ; ALL-NEXT:    movswl 12(%rdi), %eax
    864 ; ALL-NEXT:    vmovd %eax, %xmm5
    865 ; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
    866 ; ALL-NEXT:    movswl 8(%rdi), %eax
    867 ; ALL-NEXT:    vmovd %eax, %xmm6
    868 ; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
    869 ; ALL-NEXT:    movswl 10(%rdi), %eax
    870 ; ALL-NEXT:    vmovd %eax, %xmm7
    871 ; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
    872 ; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
    873 ; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    874 ; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    875 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    876 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    877 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    878 ; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    879 ; ALL-NEXT:    retq
    880   %1 = load <8 x i16>, <8 x i16>* %a0
    881   %2 = bitcast <8 x i16> %1 to <8 x half>
    882   %3 = fpext <8 x half> %2 to <8 x float>
    883   ret <8 x float> %3
    884 }
    885 
; Loads <16 x i16> and extends it to <16 x float> via half: sixteen scalar
; movswl loads (offsets 0..30) + vcvtph2ps, assembled with vinsertps and
; vinsertf128. AVX1/AVX2 return the result in ymm0/ymm1; AVX512F/AVX512VL
; additionally merge into a single ZMM with vinsertf64x4. The variants differ
; in scheduling (which lanes are converted first) and temporary register
; choice. CHECK lines are autogenerated by update_llc_test_checks.py --
; regenerate rather than hand-edit them.
    886 define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
    887 ; AVX1-LABEL: load_cvt_16i16_to_16f32:
    888 ; AVX1:       # %bb.0:
    889 ; AVX1-NEXT:    movswl 22(%rdi), %eax
    890 ; AVX1-NEXT:    vmovd %eax, %xmm0
    891 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm8
    892 ; AVX1-NEXT:    movswl 20(%rdi), %eax
    893 ; AVX1-NEXT:    vmovd %eax, %xmm0
    894 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm9
    895 ; AVX1-NEXT:    movswl 16(%rdi), %eax
    896 ; AVX1-NEXT:    vmovd %eax, %xmm0
    897 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm10
    898 ; AVX1-NEXT:    movswl 18(%rdi), %eax
    899 ; AVX1-NEXT:    vmovd %eax, %xmm0
    900 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm11
    901 ; AVX1-NEXT:    movswl 30(%rdi), %eax
    902 ; AVX1-NEXT:    vmovd %eax, %xmm0
    903 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm12
    904 ; AVX1-NEXT:    movswl 28(%rdi), %eax
    905 ; AVX1-NEXT:    vmovd %eax, %xmm0
    906 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm13
    907 ; AVX1-NEXT:    movswl 24(%rdi), %eax
    908 ; AVX1-NEXT:    vmovd %eax, %xmm0
    909 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm14
    910 ; AVX1-NEXT:    movswl 26(%rdi), %eax
    911 ; AVX1-NEXT:    vmovd %eax, %xmm0
    912 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm15
    913 ; AVX1-NEXT:    movswl 6(%rdi), %eax
    914 ; AVX1-NEXT:    vmovd %eax, %xmm0
    915 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    916 ; AVX1-NEXT:    movswl 4(%rdi), %eax
    917 ; AVX1-NEXT:    vmovd %eax, %xmm2
    918 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    919 ; AVX1-NEXT:    movswl (%rdi), %eax
    920 ; AVX1-NEXT:    vmovd %eax, %xmm3
    921 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    922 ; AVX1-NEXT:    movswl 2(%rdi), %eax
    923 ; AVX1-NEXT:    vmovd %eax, %xmm4
    924 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    925 ; AVX1-NEXT:    movswl 14(%rdi), %eax
    926 ; AVX1-NEXT:    vmovd %eax, %xmm5
    927 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    928 ; AVX1-NEXT:    movswl 12(%rdi), %eax
    929 ; AVX1-NEXT:    vmovd %eax, %xmm6
    930 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    931 ; AVX1-NEXT:    movswl 8(%rdi), %eax
    932 ; AVX1-NEXT:    vmovd %eax, %xmm7
    933 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    934 ; AVX1-NEXT:    movswl 10(%rdi), %eax
    935 ; AVX1-NEXT:    vmovd %eax, %xmm1
    936 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    937 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
    938 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    939 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    940 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
    941 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
    942 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
    943 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    944 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
    945 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    946 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    947 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
    948 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    949 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    950 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    951 ; AVX1-NEXT:    retq
    952 ;
    953 ; AVX2-LABEL: load_cvt_16i16_to_16f32:
    954 ; AVX2:       # %bb.0:
    955 ; AVX2-NEXT:    movswl 22(%rdi), %eax
    956 ; AVX2-NEXT:    vmovd %eax, %xmm0
    957 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm8
    958 ; AVX2-NEXT:    movswl 20(%rdi), %eax
    959 ; AVX2-NEXT:    vmovd %eax, %xmm0
    960 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm9
    961 ; AVX2-NEXT:    movswl 16(%rdi), %eax
    962 ; AVX2-NEXT:    vmovd %eax, %xmm0
    963 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm10
    964 ; AVX2-NEXT:    movswl 18(%rdi), %eax
    965 ; AVX2-NEXT:    vmovd %eax, %xmm0
    966 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm11
    967 ; AVX2-NEXT:    movswl 30(%rdi), %eax
    968 ; AVX2-NEXT:    vmovd %eax, %xmm0
    969 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm12
    970 ; AVX2-NEXT:    movswl 28(%rdi), %eax
    971 ; AVX2-NEXT:    vmovd %eax, %xmm0
    972 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm13
    973 ; AVX2-NEXT:    movswl 24(%rdi), %eax
    974 ; AVX2-NEXT:    vmovd %eax, %xmm0
    975 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm14
    976 ; AVX2-NEXT:    movswl 26(%rdi), %eax
    977 ; AVX2-NEXT:    vmovd %eax, %xmm0
    978 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm15
    979 ; AVX2-NEXT:    movswl 6(%rdi), %eax
    980 ; AVX2-NEXT:    vmovd %eax, %xmm0
    981 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    982 ; AVX2-NEXT:    movswl 4(%rdi), %eax
    983 ; AVX2-NEXT:    vmovd %eax, %xmm2
    984 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    985 ; AVX2-NEXT:    movswl (%rdi), %eax
    986 ; AVX2-NEXT:    vmovd %eax, %xmm3
    987 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    988 ; AVX2-NEXT:    movswl 2(%rdi), %eax
    989 ; AVX2-NEXT:    vmovd %eax, %xmm4
    990 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    991 ; AVX2-NEXT:    movswl 14(%rdi), %eax
    992 ; AVX2-NEXT:    vmovd %eax, %xmm5
    993 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    994 ; AVX2-NEXT:    movswl 12(%rdi), %eax
    995 ; AVX2-NEXT:    vmovd %eax, %xmm6
    996 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    997 ; AVX2-NEXT:    movswl 8(%rdi), %eax
    998 ; AVX2-NEXT:    vmovd %eax, %xmm7
    999 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
   1000 ; AVX2-NEXT:    movswl 10(%rdi), %eax
   1001 ; AVX2-NEXT:    vmovd %eax, %xmm1
   1002 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
   1003 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
   1004 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
   1005 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
   1006 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
   1007 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
   1008 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
   1009 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1010 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
   1011 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
   1012 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
   1013 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
   1014 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
   1015 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
   1016 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
   1017 ; AVX2-NEXT:    retq
   1018 ;
   1019 ; AVX512F-LABEL: load_cvt_16i16_to_16f32:
   1020 ; AVX512F:       # %bb.0:
   1021 ; AVX512F-NEXT:    movswl 6(%rdi), %eax
   1022 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1023 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm8
   1024 ; AVX512F-NEXT:    movswl 4(%rdi), %eax
   1025 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1026 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm9
   1027 ; AVX512F-NEXT:    movswl (%rdi), %eax
   1028 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1029 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm10
   1030 ; AVX512F-NEXT:    movswl 2(%rdi), %eax
   1031 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1032 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm11
   1033 ; AVX512F-NEXT:    movswl 14(%rdi), %eax
   1034 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1035 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm12
   1036 ; AVX512F-NEXT:    movswl 12(%rdi), %eax
   1037 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1038 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm13
   1039 ; AVX512F-NEXT:    movswl 8(%rdi), %eax
   1040 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1041 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm14
   1042 ; AVX512F-NEXT:    movswl 10(%rdi), %eax
   1043 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1044 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm15
   1045 ; AVX512F-NEXT:    movswl 22(%rdi), %eax
   1046 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1047 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
   1048 ; AVX512F-NEXT:    movswl 20(%rdi), %eax
   1049 ; AVX512F-NEXT:    vmovd %eax, %xmm1
   1050 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
   1051 ; AVX512F-NEXT:    movswl 16(%rdi), %eax
   1052 ; AVX512F-NEXT:    vmovd %eax, %xmm2
   1053 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
   1054 ; AVX512F-NEXT:    movswl 18(%rdi), %eax
   1055 ; AVX512F-NEXT:    vmovd %eax, %xmm3
   1056 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
   1057 ; AVX512F-NEXT:    movswl 30(%rdi), %eax
   1058 ; AVX512F-NEXT:    vmovd %eax, %xmm4
   1059 ; AVX512F-NEXT:    vcvtph2ps %xmm4, %xmm4
   1060 ; AVX512F-NEXT:    movswl 28(%rdi), %eax
   1061 ; AVX512F-NEXT:    vmovd %eax, %xmm5
   1062 ; AVX512F-NEXT:    vcvtph2ps %xmm5, %xmm5
   1063 ; AVX512F-NEXT:    movswl 24(%rdi), %eax
   1064 ; AVX512F-NEXT:    vmovd %eax, %xmm6
   1065 ; AVX512F-NEXT:    vcvtph2ps %xmm6, %xmm6
   1066 ; AVX512F-NEXT:    movswl 26(%rdi), %eax
   1067 ; AVX512F-NEXT:    vmovd %eax, %xmm7
   1068 ; AVX512F-NEXT:    vcvtph2ps %xmm7, %xmm7
   1069 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
   1070 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
   1071 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
   1072 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
   1073 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
   1074 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1075 ; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1076 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
   1077 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
   1078 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
   1079 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
   1080 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
   1081 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
   1082 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
   1083 ; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
   1084 ; AVX512F-NEXT:    retq
   1085 ;
   1086 ; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
   1087 ; AVX512VL:       # %bb.0:
   1088 ; AVX512VL-NEXT:    movswl 6(%rdi), %eax
   1089 ; AVX512VL-NEXT:    vmovd %eax, %xmm0
   1090 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm8
   1091 ; AVX512VL-NEXT:    movswl 4(%rdi), %eax
   1092 ; AVX512VL-NEXT:    vmovd %eax, %xmm1
   1093 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm9
   1094 ; AVX512VL-NEXT:    movswl (%rdi), %eax
   1095 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
   1096 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm10
   1097 ; AVX512VL-NEXT:    movswl 2(%rdi), %eax
   1098 ; AVX512VL-NEXT:    vmovd %eax, %xmm3
   1099 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm11
   1100 ; AVX512VL-NEXT:    movswl 14(%rdi), %eax
   1101 ; AVX512VL-NEXT:    vmovd %eax, %xmm4
   1102 ; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm12
   1103 ; AVX512VL-NEXT:    movswl 12(%rdi), %eax
   1104 ; AVX512VL-NEXT:    vmovd %eax, %xmm5
   1105 ; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm13
   1106 ; AVX512VL-NEXT:    movswl 8(%rdi), %eax
   1107 ; AVX512VL-NEXT:    vmovd %eax, %xmm6
   1108 ; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm14
   1109 ; AVX512VL-NEXT:    movswl 10(%rdi), %eax
   1110 ; AVX512VL-NEXT:    vmovd %eax, %xmm7
   1111 ; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm15
   1112 ; AVX512VL-NEXT:    movswl 22(%rdi), %eax
   1113 ; AVX512VL-NEXT:    vmovd %eax, %xmm0
   1114 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1115 ; AVX512VL-NEXT:    movswl 20(%rdi), %eax
   1116 ; AVX512VL-NEXT:    vmovd %eax, %xmm1
   1117 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1118 ; AVX512VL-NEXT:    movswl 16(%rdi), %eax
   1119 ; AVX512VL-NEXT:    vmovd %eax, %xmm2
   1120 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
   1121 ; AVX512VL-NEXT:    movswl 18(%rdi), %eax
   1122 ; AVX512VL-NEXT:    vmovd %eax, %xmm3
   1123 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
   1124 ; AVX512VL-NEXT:    movswl 30(%rdi), %eax
   1125 ; AVX512VL-NEXT:    vmovd %eax, %xmm4
   1126 ; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
   1127 ; AVX512VL-NEXT:    movswl 28(%rdi), %eax
   1128 ; AVX512VL-NEXT:    vmovd %eax, %xmm5
   1129 ; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
   1130 ; AVX512VL-NEXT:    movswl 24(%rdi), %eax
   1131 ; AVX512VL-NEXT:    vmovd %eax, %xmm6
   1132 ; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
   1133 ; AVX512VL-NEXT:    movswl 26(%rdi), %eax
   1134 ; AVX512VL-NEXT:    vmovd %eax, %xmm7
   1135 ; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
   1136 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
   1137 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
   1138 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
   1139 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
   1140 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
   1141 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
   1142 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1143 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
   1144 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
   1145 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
   1146 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
   1147 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
   1148 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
   1149 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
   1150 ; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
   1151 ; AVX512VL-NEXT:    retq
   1152   %1 = load <16 x i16>, <16 x i16>* %a0
   1153   %2 = bitcast <16 x i16> %1 to <16 x half>
   1154   %3 = fpext <16 x half> %2 to <16 x float>
   1155   ret <16 x float> %3
   1156 }
   1157 
   1158 ;
   1159 ; Half to Double
   1160 ;
   1161 
; Reinterprets a scalar i16 as half and extends it all the way to double.
; There is no direct half->double instruction here, so the lowering chains
; vcvtph2ps (half->float) with vcvtss2sd (float->double). CHECK lines are
; autogenerated by update_llc_test_checks.py.
   1162 define double @cvt_i16_to_f64(i16 %a0) nounwind {
   1163 ; ALL-LABEL: cvt_i16_to_f64:
   1164 ; ALL:       # %bb.0:
   1165 ; ALL-NEXT:    movswl %di, %eax
   1166 ; ALL-NEXT:    vmovd %eax, %xmm0
   1167 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1168 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1169 ; ALL-NEXT:    retq
   1170   %1 = bitcast i16 %a0 to half
   1171   %2 = fpext half %1 to double
   1172   ret double %2
   1173 }
   1174 
; Extends <2 x i16> (as <2 x half>) to <2 x double>. The two i16 lanes are
; packed into one GPR, split with shrl/sign-extends, converted half->float
; (vcvtph2ps) then float->double (vcvtss2sd), and joined with vmovlhps. The
; variants differ only in how the lanes are gathered: AVX1/AVX2-SLOW/AVX512F
; use vpshufd+vpshuflw, AVX2-FAST uses a single vpshufb, and AVX512VL spills
; through the stack with vpmovqw. CHECK lines are autogenerated by
; update_llc_test_checks.py.
   1175 define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
   1176 ; AVX1-LABEL: cvt_2i16_to_2f64:
   1177 ; AVX1:       # %bb.0:
   1178 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1179 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1180 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1181 ; AVX1-NEXT:    movswl %ax, %ecx
   1182 ; AVX1-NEXT:    shrl $16, %eax
   1183 ; AVX1-NEXT:    cwtl
   1184 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1185 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
   1186 ; AVX1-NEXT:    vmovd %ecx, %xmm1
   1187 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
   1188 ; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1189 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1190 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1191 ; AVX1-NEXT:    retq
   1192 ;
   1193 ; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
   1194 ; AVX2-SLOW:       # %bb.0:
   1195 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1196 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1197 ; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
   1198 ; AVX2-SLOW-NEXT:    movswl %ax, %ecx
   1199 ; AVX2-SLOW-NEXT:    shrl $16, %eax
   1200 ; AVX2-SLOW-NEXT:    cwtl
   1201 ; AVX2-SLOW-NEXT:    vmovd %eax, %xmm0
   1202 ; AVX2-SLOW-NEXT:    vcvtph2ps %xmm0, %xmm0
   1203 ; AVX2-SLOW-NEXT:    vmovd %ecx, %xmm1
   1204 ; AVX2-SLOW-NEXT:    vcvtph2ps %xmm1, %xmm1
   1205 ; AVX2-SLOW-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1206 ; AVX2-SLOW-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1207 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1208 ; AVX2-SLOW-NEXT:    retq
   1209 ;
   1210 ; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
   1211 ; AVX2-FAST:       # %bb.0:
   1212 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
   1213 ; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
   1214 ; AVX2-FAST-NEXT:    movswl %ax, %ecx
   1215 ; AVX2-FAST-NEXT:    shrl $16, %eax
   1216 ; AVX2-FAST-NEXT:    cwtl
   1217 ; AVX2-FAST-NEXT:    vmovd %eax, %xmm0
   1218 ; AVX2-FAST-NEXT:    vcvtph2ps %xmm0, %xmm0
   1219 ; AVX2-FAST-NEXT:    vmovd %ecx, %xmm1
   1220 ; AVX2-FAST-NEXT:    vcvtph2ps %xmm1, %xmm1
   1221 ; AVX2-FAST-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1222 ; AVX2-FAST-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1223 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1224 ; AVX2-FAST-NEXT:    retq
   1225 ;
   1226 ; AVX512F-LABEL: cvt_2i16_to_2f64:
   1227 ; AVX512F:       # %bb.0:
   1228 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1229 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1230 ; AVX512F-NEXT:    vmovd %xmm0, %eax
   1231 ; AVX512F-NEXT:    movswl %ax, %ecx
   1232 ; AVX512F-NEXT:    shrl $16, %eax
   1233 ; AVX512F-NEXT:    cwtl
   1234 ; AVX512F-NEXT:    vmovd %eax, %xmm0
   1235 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
   1236 ; AVX512F-NEXT:    vmovd %ecx, %xmm1
   1237 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
   1238 ; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1239 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1240 ; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1241 ; AVX512F-NEXT:    retq
   1242 ;
   1243 ; AVX512VL-LABEL: cvt_2i16_to_2f64:
   1244 ; AVX512VL:       # %bb.0:
   1245 ; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
   1246 ; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
   1247 ; AVX512VL-NEXT:    movswl %ax, %ecx
   1248 ; AVX512VL-NEXT:    shrl $16, %eax
   1249 ; AVX512VL-NEXT:    cwtl
   1250 ; AVX512VL-NEXT:    vmovd %eax, %xmm0
   1251 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1252 ; AVX512VL-NEXT:    vmovd %ecx, %xmm1
   1253 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1254 ; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1255 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1256 ; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1257 ; AVX512VL-NEXT:    retq
   1258   %1 = bitcast <2 x i16> %a0 to <2 x half>
   1259   %2 = fpext <2 x half> %1 to <2 x double>
   1260   ret <2 x double> %2
   1261 }
   1262 
   1263 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
   1264 ; AVX1-LABEL: cvt_4i16_to_4f64:
   1265 ; AVX1:       # %bb.0:
   1266 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1267 ; AVX1-NEXT:    vmovq %xmm0, %rax
   1268 ; AVX1-NEXT:    movq %rax, %rcx
   1269 ; AVX1-NEXT:    movl %eax, %edx
   1270 ; AVX1-NEXT:    movswl %ax, %esi
   1271 ; AVX1-NEXT:    shrq $48, %rax
   1272 ; AVX1-NEXT:    shrq $32, %rcx
   1273 ; AVX1-NEXT:    shrl $16, %edx
   1274 ; AVX1-NEXT:    movswl %dx, %edx
   1275 ; AVX1-NEXT:    vmovd %edx, %xmm0
   1276 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
   1277 ; AVX1-NEXT:    vmovd %esi, %xmm1
   1278 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
   1279 ; AVX1-NEXT:    movswl %cx, %ecx
   1280 ; AVX1-NEXT:    vmovd %ecx, %xmm2
   1281 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
   1282 ; AVX1-NEXT:    cwtl
   1283 ; AVX1-NEXT:    vmovd %eax, %xmm3
   1284 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
   1285 ; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1286 ; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1287 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1288 ; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1289 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1290 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1291 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1292 ; AVX1-NEXT:    retq
   1293 ;
   1294 ; AVX2-LABEL: cvt_4i16_to_4f64:
   1295 ; AVX2:       # %bb.0:
   1296 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1297 ; AVX2-NEXT:    vmovq %xmm0, %rax
   1298 ; AVX2-NEXT:    movq %rax, %rcx
   1299 ; AVX2-NEXT:    movl %eax, %edx
   1300 ; AVX2-NEXT:    movswl %ax, %esi
   1301 ; AVX2-NEXT:    shrq $48, %rax
   1302 ; AVX2-NEXT:    shrq $32, %rcx
   1303 ; AVX2-NEXT:    shrl $16, %edx
   1304 ; AVX2-NEXT:    movswl %dx, %edx
   1305 ; AVX2-NEXT:    vmovd %edx, %xmm0
   1306 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
   1307 ; AVX2-NEXT:    vmovd %esi, %xmm1
   1308 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
   1309 ; AVX2-NEXT:    movswl %cx, %ecx
   1310 ; AVX2-NEXT:    vmovd %ecx, %xmm2
   1311 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
   1312 ; AVX2-NEXT:    cwtl
   1313 ; AVX2-NEXT:    vmovd %eax, %xmm3
   1314 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
   1315 ; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1316 ; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1317 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1318 ; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1319 ; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1320 ; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1321 ; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1322 ; AVX2-NEXT:    retq
   1323 ;
   1324 ; AVX512F-LABEL: cvt_4i16_to_4f64:
   1325 ; AVX512F:       # %bb.0:
   1326 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1327 ; AVX512F-NEXT:    vmovq %xmm0, %rax
   1328 ; AVX512F-NEXT:    movq %rax, %rcx
   1329 ; AVX512F-NEXT:    movl %eax, %edx
   1330 ; AVX512F-NEXT:    movswl %ax, %esi
   1331 ; AVX512F-NEXT:    shrq $48, %rax
   1332 ; AVX512F-NEXT:    shrq $32, %rcx
   1333 ; AVX512F-NEXT:    shrl $16, %edx
   1334 ; AVX512F-NEXT:    movswl %dx, %edx
   1335 ; AVX512F-NEXT:    vmovd %edx, %xmm0
   1336 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
   1337 ; AVX512F-NEXT:    vmovd %esi, %xmm1
   1338 ; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
   1339 ; AVX512F-NEXT:    movswl %cx, %ecx
   1340 ; AVX512F-NEXT:    vmovd %ecx, %xmm2
   1341 ; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
   1342 ; AVX512F-NEXT:    cwtl
   1343 ; AVX512F-NEXT:    vmovd %eax, %xmm3
   1344 ; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
   1345 ; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1346 ; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1347 ; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1348 ; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1349 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1350 ; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1351 ; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1352 ; AVX512F-NEXT:    retq
   1353 ;
   1354 ; AVX512VL-LABEL: cvt_4i16_to_4f64:
   1355 ; AVX512VL:       # %bb.0:
   1356 ; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
   1357 ; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
   1358 ; AVX512VL-NEXT:    movq %rax, %rcx
   1359 ; AVX512VL-NEXT:    movl %eax, %edx
   1360 ; AVX512VL-NEXT:    movswl %ax, %esi
   1361 ; AVX512VL-NEXT:    shrq $48, %rax
   1362 ; AVX512VL-NEXT:    shrq $32, %rcx
   1363 ; AVX512VL-NEXT:    shrl $16, %edx
   1364 ; AVX512VL-NEXT:    movswl %dx, %edx
   1365 ; AVX512VL-NEXT:    vmovd %edx, %xmm0
   1366 ; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1367 ; AVX512VL-NEXT:    vmovd %esi, %xmm1
   1368 ; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1369 ; AVX512VL-NEXT:    movswl %cx, %ecx
   1370 ; AVX512VL-NEXT:    vmovd %ecx, %xmm2
   1371 ; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
   1372 ; AVX512VL-NEXT:    cwtl
   1373 ; AVX512VL-NEXT:    vmovd %eax, %xmm3
   1374 ; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
   1375 ; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1376 ; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1377 ; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1378 ; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1379 ; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1380 ; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1381 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1382 ; AVX512VL-NEXT:    retq
   1383   %1 = bitcast <4 x i16> %a0 to <4 x half>
   1384   %2 = fpext <4 x half> %1 to <4 x double>
   1385   ret <4 x double> %2
   1386 }
   1387 
define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_8i16_to_2f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT:    retq
; Take the low two i16 lanes of the input, reinterpret them as half, and
; widen each to double. The CHECK lines above are autogenerated by
; update_llc_test_checks.py — do not edit them by hand.
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i16> %1 to <2 x half>
  %3 = fpext <2 x half> %2 to <2 x double>
  ret <2 x double> %3
}
   1455 
define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    shrq $48, %rax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    shrq $48, %rax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vmovd %esi, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_8i16_to_4f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %esi, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movl %eax, %edx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    shrq $48, %rax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrl $16, %edx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %esi, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
; Extract the low four i16 lanes, reinterpret them as half, and widen each
; to double. The CHECK lines above are autogenerated by
; update_llc_test_checks.py — do not edit them by hand.
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x double>
  ret <4 x double> %3
}
   1579 
define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rdx
; AVX1-NEXT:    movq %rdx, %r9
; AVX1-NEXT:    movl %edx, %r10d
; AVX1-NEXT:    movswl %dx, %r8d
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    shrq $32, %r9
; AVX1-NEXT:    shrl $16, %r10d
; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
; AVX1-NEXT:    movq %rdi, %rsi
; AVX1-NEXT:    movl %edi, %eax
; AVX1-NEXT:    movswl %di, %ecx
; AVX1-NEXT:    shrq $48, %rdi
; AVX1-NEXT:    shrq $32, %rsi
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
; AVX1-NEXT:    movswl %si, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT:    movswl %di, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
; AVX1-NEXT:    movswl %r10w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl %r9w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    movswl %dx, %eax
; AVX1-NEXT:    vmovd %eax, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %xmm0, %rdx
; AVX2-NEXT:    movq %rdx, %r9
; AVX2-NEXT:    movl %edx, %r10d
; AVX2-NEXT:    movswl %dx, %r8d
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    shrq $32, %r9
; AVX2-NEXT:    shrl $16, %r10d
; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
; AVX2-NEXT:    movq %rdi, %rsi
; AVX2-NEXT:    movl %edi, %eax
; AVX2-NEXT:    movswl %di, %ecx
; AVX2-NEXT:    shrq $48, %rdi
; AVX2-NEXT:    shrq $32, %rsi
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
; AVX2-NEXT:    movswl %si, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT:    movswl %di, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
; AVX2-NEXT:    movswl %r10w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vmovd %r8d, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl %r9w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    movswl %dx, %eax
; AVX2-NEXT:    vmovd %eax, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX512-NEXT:    movq %rdx, %r9
; AVX512-NEXT:    movl %edx, %r10d
; AVX512-NEXT:    movswl %dx, %r8d
; AVX512-NEXT:    shrq $48, %rdx
; AVX512-NEXT:    shrq $32, %r9
; AVX512-NEXT:    shrl $16, %r10d
; AVX512-NEXT:    vmovq %xmm0, %rdi
; AVX512-NEXT:    movq %rdi, %rsi
; AVX512-NEXT:    movl %edi, %eax
; AVX512-NEXT:    movswl %di, %ecx
; AVX512-NEXT:    shrq $48, %rdi
; AVX512-NEXT:    shrq $32, %rsi
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movswl %si, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    movswl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    movswl %r10w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    vmovd %r8d, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    movswl %r9w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    movswl %dx, %eax
; AVX512-NEXT:    vmovd %eax, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512-NEXT:    retq
; Reinterpret all eight i16 lanes as half and widen each to double. The
; CHECK lines above are autogenerated by update_llc_test_checks.py — do not
; edit them by hand.
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x double>
  ret <8 x double> %2
}
   1747 
   1748 ;
   1749 ; Half to Double (Load)
   1750 ;
   1751 
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f64:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT:    retq
; Load one i16 from memory, reinterpret it as half, and widen to double.
; The CHECK lines above are autogenerated by update_llc_test_checks.py.
  %1 = load i16, i16* %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to double
  ret double %3
}
   1765 
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_2i16_to_2f64:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; ALL-NEXT:    retq
; Load a <2 x i16> from memory, reinterpret it as <2 x half>, and widen each
; lane to double. The CHECK lines above are autogenerated by
; update_llc_test_checks.py.
  %1 = load <2 x i16>, <2 x i16>* %a0
  %2 = bitcast <2 x i16> %1 to <2 x half>
  %3 = fpext <2 x half> %2 to <2 x double>
  ret <2 x double> %3
}
   1784 
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; ALL-NEXT:    retq
; Load a <4 x i16> from memory, reinterpret it as <4 x half>, and widen each
; lane to double. The CHECK lines above are autogenerated by
; update_llc_test_checks.py.
  %1 = load <4 x i16>, <4 x i16>* %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x double>
  ret <4 x double> %3
}
   1813 
define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    shrq $48, %rax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    shrq $48, %rax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vmovd %esi, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq (%rdi), %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %esi, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movl %eax, %edx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    shrq $48, %rax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrl $16, %edx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %esi, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
; Load a <8 x i16> from memory, keep only the low four lanes, reinterpret
; them as half, and widen each to double. The CHECK lines above are
; autogenerated by update_llc_test_checks.py — do not edit them by hand.
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x double>
  ret <4 x double> %4
}
   1938 
define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movswl 8(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT:    movswl 10(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
; AVX1-NEXT:    movswl 12(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT:    movswl 14(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
; AVX1-NEXT:    movswl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl 2(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl 4(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    movswl 6(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movswl 8(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT:    movswl 10(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
; AVX2-NEXT:    movswl 12(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT:    movswl 14(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
; AVX2-NEXT:    movswl (%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl 2(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl 4(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    movswl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movswl (%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movswl 2(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movswl 4(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    movswl 6(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    movswl 8(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    movswl 10(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    movswl 12(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    movswl 14(%rdi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512-NEXT:    retq
; Load a <8 x i16> from memory, reinterpret it as <8 x half>, and widen each
; lane to double. The CHECK lines above are autogenerated by
; update_llc_test_checks.py — do not edit them by hand.
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x double>
  ret <8 x double> %3
}
   2071 
   2072 ;
   2073 ; Float to Half
   2074 ;
   2075 
   2076 define i16 @cvt_f32_to_i16(float %a0) nounwind {
        ; Scalar float -> half, returned as the raw i16 bit pattern. All F16C/AVX512
        ; subtargets are expected to lower this to a single vcvtps2ph + vmovd; the
        ; "kill" note marks the implicit truncation of %eax to %ax for the return.
   2077 ; ALL-LABEL: cvt_f32_to_i16:
   2078 ; ALL:       # %bb.0:
   2079 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2080 ; ALL-NEXT:    vmovd %xmm0, %eax
   2081 ; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
   2082 ; ALL-NEXT:    retq
   2083   %1 = fptrunc float %a0 to half
   2084   %2 = bitcast half %1 to i16
   2085   ret i16 %2
   2086 }
   2087 
   2088 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
        ; <4 x float> -> <4 x half>, bitcast to <4 x i16>. Current lowering is fully
        ; scalarized: each lane is shuffled to element 0, converted with vcvtps2ph,
        ; moved to a GPR, and the four i16 results are packed with shll/movzwl/or
        ; into one 64-bit value that is moved back into xmm0.
   2089 ; ALL-LABEL: cvt_4f32_to_4i16:
   2090 ; ALL:       # %bb.0:
   2091 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2092 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2093 ; ALL-NEXT:    vmovd %xmm1, %eax
   2094 ; ALL-NEXT:    shll $16, %eax
   2095 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2096 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2097 ; ALL-NEXT:    movzwl %cx, %ecx
   2098 ; ALL-NEXT:    orl %eax, %ecx
   2099 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2100 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2101 ; ALL-NEXT:    vmovd %xmm1, %eax
   2102 ; ALL-NEXT:    shll $16, %eax
   2103 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2104 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2105 ; ALL-NEXT:    vmovd %xmm0, %edx
   2106 ; ALL-NEXT:    movzwl %dx, %edx
   2107 ; ALL-NEXT:    orl %eax, %edx
   2108 ; ALL-NEXT:    shlq $32, %rdx
   2109 ; ALL-NEXT:    orq %rcx, %rdx
   2110 ; ALL-NEXT:    vmovq %rdx, %xmm0
   2111 ; ALL-NEXT:    retq
   2112   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2113   %2 = bitcast <4 x half> %1 to <4 x i16>
   2114   ret <4 x i16> %2
   2115 }
   2116 
   2117 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
        ; Same scalarized 4xf32->4xf16 packing as cvt_4f32_to_4i16, then widened to
        ; <8 x i16> with undef upper elements; the widening shows up as a trailing
        ; vpshuflw that spreads the four converted words into the low lanes.
   2118 ; ALL-LABEL: cvt_4f32_to_8i16_undef:
   2119 ; ALL:       # %bb.0:
   2120 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2121 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2122 ; ALL-NEXT:    vmovd %xmm1, %eax
   2123 ; ALL-NEXT:    shll $16, %eax
   2124 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2125 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2126 ; ALL-NEXT:    movzwl %cx, %ecx
   2127 ; ALL-NEXT:    orl %eax, %ecx
   2128 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2129 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2130 ; ALL-NEXT:    vmovd %xmm1, %eax
   2131 ; ALL-NEXT:    shll $16, %eax
   2132 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2133 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2134 ; ALL-NEXT:    vmovd %xmm0, %edx
   2135 ; ALL-NEXT:    movzwl %dx, %edx
   2136 ; ALL-NEXT:    orl %eax, %edx
   2137 ; ALL-NEXT:    shlq $32, %rdx
   2138 ; ALL-NEXT:    orq %rcx, %rdx
   2139 ; ALL-NEXT:    vmovq %rdx, %xmm0
   2140 ; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2141 ; ALL-NEXT:    retq
   2142   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2143   %2 = bitcast <4 x half> %1 to <4 x i16>
   2144   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2145   ret <8 x i16> %3
   2146 }
   2147 
   2148 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
        ; As cvt_4f32_to_8i16_undef but the upper four i16 lanes must be ZERO, so
        ; per-subtarget tails differ: AVX1/AVX2-SLOW/AVX512F use vpshuflw + vmovq
        ; (vmovq zeroes the high qword), while AVX2-FAST and AVX512VL (both with
        ; +fast-variable-shuffle) fold it into a single zeroing vpshufb.
   2149 ; AVX1-LABEL: cvt_4f32_to_8i16_zero:
   2150 ; AVX1:       # %bb.0:
   2151 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2152 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2153 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2154 ; AVX1-NEXT:    shll $16, %eax
   2155 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2156 ; AVX1-NEXT:    vmovd %xmm1, %ecx
   2157 ; AVX1-NEXT:    movzwl %cx, %ecx
   2158 ; AVX1-NEXT:    orl %eax, %ecx
   2159 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2160 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2161 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2162 ; AVX1-NEXT:    shll $16, %eax
   2163 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2164 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2165 ; AVX1-NEXT:    vmovd %xmm0, %edx
   2166 ; AVX1-NEXT:    movzwl %dx, %edx
   2167 ; AVX1-NEXT:    orl %eax, %edx
   2168 ; AVX1-NEXT:    shlq $32, %rdx
   2169 ; AVX1-NEXT:    orq %rcx, %rdx
   2170 ; AVX1-NEXT:    vmovq %rdx, %xmm0
   2171 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2172 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2173 ; AVX1-NEXT:    retq
   2174 ;
   2175 ; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero:
   2176 ; AVX2-SLOW:       # %bb.0:
   2177 ; AVX2-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2178 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2179 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
   2180 ; AVX2-SLOW-NEXT:    shll $16, %eax
   2181 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2182 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %ecx
   2183 ; AVX2-SLOW-NEXT:    movzwl %cx, %ecx
   2184 ; AVX2-SLOW-NEXT:    orl %eax, %ecx
   2185 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2186 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2187 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
   2188 ; AVX2-SLOW-NEXT:    shll $16, %eax
   2189 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2190 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2191 ; AVX2-SLOW-NEXT:    vmovd %xmm0, %edx
   2192 ; AVX2-SLOW-NEXT:    movzwl %dx, %edx
   2193 ; AVX2-SLOW-NEXT:    orl %eax, %edx
   2194 ; AVX2-SLOW-NEXT:    shlq $32, %rdx
   2195 ; AVX2-SLOW-NEXT:    orq %rcx, %rdx
   2196 ; AVX2-SLOW-NEXT:    vmovq %rdx, %xmm0
   2197 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2198 ; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2199 ; AVX2-SLOW-NEXT:    retq
   2200 ;
   2201 ; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero:
   2202 ; AVX2-FAST:       # %bb.0:
   2203 ; AVX2-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2204 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2205 ; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
   2206 ; AVX2-FAST-NEXT:    shll $16, %eax
   2207 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2208 ; AVX2-FAST-NEXT:    vmovd %xmm1, %ecx
   2209 ; AVX2-FAST-NEXT:    movzwl %cx, %ecx
   2210 ; AVX2-FAST-NEXT:    orl %eax, %ecx
   2211 ; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2212 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2213 ; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
   2214 ; AVX2-FAST-NEXT:    shll $16, %eax
   2215 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2216 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2217 ; AVX2-FAST-NEXT:    vmovd %xmm0, %edx
   2218 ; AVX2-FAST-NEXT:    movzwl %dx, %edx
   2219 ; AVX2-FAST-NEXT:    orl %eax, %edx
   2220 ; AVX2-FAST-NEXT:    shlq $32, %rdx
   2221 ; AVX2-FAST-NEXT:    orq %rcx, %rdx
   2222 ; AVX2-FAST-NEXT:    vmovq %rdx, %xmm0
   2223 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   2224 ; AVX2-FAST-NEXT:    retq
   2225 ;
   2226 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
   2227 ; AVX512F:       # %bb.0:
   2228 ; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2229 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2230 ; AVX512F-NEXT:    vmovd %xmm1, %eax
   2231 ; AVX512F-NEXT:    shll $16, %eax
   2232 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2233 ; AVX512F-NEXT:    vmovd %xmm1, %ecx
   2234 ; AVX512F-NEXT:    movzwl %cx, %ecx
   2235 ; AVX512F-NEXT:    orl %eax, %ecx
   2236 ; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2237 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2238 ; AVX512F-NEXT:    vmovd %xmm1, %eax
   2239 ; AVX512F-NEXT:    shll $16, %eax
   2240 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2241 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2242 ; AVX512F-NEXT:    vmovd %xmm0, %edx
   2243 ; AVX512F-NEXT:    movzwl %dx, %edx
   2244 ; AVX512F-NEXT:    orl %eax, %edx
   2245 ; AVX512F-NEXT:    shlq $32, %rdx
   2246 ; AVX512F-NEXT:    orq %rcx, %rdx
   2247 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
   2248 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2249 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2250 ; AVX512F-NEXT:    retq
   2251 ;
   2252 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
   2253 ; AVX512VL:       # %bb.0:
   2254 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2255 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2256 ; AVX512VL-NEXT:    vmovd %xmm1, %eax
   2257 ; AVX512VL-NEXT:    shll $16, %eax
   2258 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2259 ; AVX512VL-NEXT:    vmovd %xmm1, %ecx
   2260 ; AVX512VL-NEXT:    movzwl %cx, %ecx
   2261 ; AVX512VL-NEXT:    orl %eax, %ecx
   2262 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2263 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2264 ; AVX512VL-NEXT:    vmovd %xmm1, %eax
   2265 ; AVX512VL-NEXT:    shll $16, %eax
   2266 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2267 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2268 ; AVX512VL-NEXT:    vmovd %xmm0, %edx
   2269 ; AVX512VL-NEXT:    movzwl %dx, %edx
   2270 ; AVX512VL-NEXT:    orl %eax, %edx
   2271 ; AVX512VL-NEXT:    shlq $32, %rdx
   2272 ; AVX512VL-NEXT:    orq %rcx, %rdx
   2273 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
   2274 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   2275 ; AVX512VL-NEXT:    retq
   2276   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2277   %2 = bitcast <4 x half> %1 to <4 x i16>
   2278   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2279   ret <8 x i16> %3
   2280 }
   2281 
   2282 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
        ; <8 x float> (ymm) -> <8 x i16>: the low xmm half and the extracted high
        ; half are each packed through GPRs as in cvt_4f32_to_4i16, then rejoined
        ; with vpunpcklqdq. vzeroupper is required before returning because a ymm
        ; register was used.
   2283 ; ALL-LABEL: cvt_8f32_to_8i16:
   2284 ; ALL:       # %bb.0:
   2285 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2286 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2287 ; ALL-NEXT:    vmovd %xmm1, %eax
   2288 ; ALL-NEXT:    shll $16, %eax
   2289 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2290 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2291 ; ALL-NEXT:    movzwl %cx, %ecx
   2292 ; ALL-NEXT:    orl %eax, %ecx
   2293 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2294 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2295 ; ALL-NEXT:    vmovd %xmm1, %edx
   2296 ; ALL-NEXT:    shll $16, %edx
   2297 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2298 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2299 ; ALL-NEXT:    vmovd %xmm1, %eax
   2300 ; ALL-NEXT:    movzwl %ax, %eax
   2301 ; ALL-NEXT:    orl %edx, %eax
   2302 ; ALL-NEXT:    shlq $32, %rax
   2303 ; ALL-NEXT:    orq %rcx, %rax
   2304 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2305 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2306 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2307 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2308 ; ALL-NEXT:    shll $16, %ecx
   2309 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2310 ; ALL-NEXT:    vmovd %xmm1, %edx
   2311 ; ALL-NEXT:    movzwl %dx, %edx
   2312 ; ALL-NEXT:    orl %ecx, %edx
   2313 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2314 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2315 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2316 ; ALL-NEXT:    shll $16, %ecx
   2317 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2318 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2319 ; ALL-NEXT:    vmovd %xmm0, %esi
   2320 ; ALL-NEXT:    movzwl %si, %esi
   2321 ; ALL-NEXT:    orl %ecx, %esi
   2322 ; ALL-NEXT:    shlq $32, %rsi
   2323 ; ALL-NEXT:    orq %rdx, %rsi
   2324 ; ALL-NEXT:    vmovq %rsi, %xmm0
   2325 ; ALL-NEXT:    vmovq %rax, %xmm1
   2326 ; ALL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2327 ; ALL-NEXT:    vzeroupper
   2328 ; ALL-NEXT:    retq
   2329   %1 = fptrunc <8 x float> %a0 to <8 x half>
   2330   %2 = bitcast <8 x half> %1 to <8 x i16>
   2331   ret <8 x i16> %2
   2332 }
   2333 
   2334 define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
        ; <16 x float> -> <16 x i16>: each of the 16 lanes is converted with a
        ; scalar vcvtps2ph and reassembled with vpinsrw chains, one xmm per 8
        ; lanes, joined by vinsert(f/i)128. AVX1/AVX2 take the input in ymm0/ymm1;
        ; AVX512 takes a single zmm0 and first splits it with vextractf64x4.
   2335 ; AVX1-LABEL: cvt_16f32_to_16i16:
   2336 ; AVX1:       # %bb.0:
   2337 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   2338 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2339 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2340 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2341 ; AVX1-NEXT:    vmovd %eax, %xmm3
   2342 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2343 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2344 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2345 ; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2346 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2347 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2348 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   2349 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2350 ; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2351 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2352 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   2353 ; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2354 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2355 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   2356 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2357 ; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2358 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2359 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   2360 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2361 ; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2362 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2363 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2364 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   2365 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2366 ; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   2367 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2368 ; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   2369 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2370 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2371 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2372 ; AVX1-NEXT:    vmovd %eax, %xmm3
   2373 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2374 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2375 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2376 ; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2377 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2378 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2379 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2380 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2381 ; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2382 ; AVX1-NEXT:    vmovd %xmm0, %eax
   2383 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2384 ; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2385 ; AVX1-NEXT:    vmovd %xmm0, %eax
   2386 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   2387 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2388 ; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2389 ; AVX1-NEXT:    vmovd %xmm0, %eax
   2390 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   2391 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2392 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   2393 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2394 ; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2395 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2396 ; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   2397 ; AVX1-NEXT:    vmovd %xmm0, %eax
   2398 ; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   2399 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2400 ; AVX1-NEXT:    retq
   2401 ;
   2402 ; AVX2-LABEL: cvt_16f32_to_16i16:
   2403 ; AVX2:       # %bb.0:
   2404 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   2405 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2406 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2407 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2408 ; AVX2-NEXT:    vmovd %eax, %xmm3
   2409 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2410 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2411 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2412 ; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2413 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2414 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2415 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   2416 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2417 ; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2418 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2419 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   2420 ; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2421 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2422 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   2423 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2424 ; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2425 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2426 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   2427 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2428 ; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2429 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2430 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2431 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   2432 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2433 ; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   2434 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2435 ; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   2436 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2437 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2438 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2439 ; AVX2-NEXT:    vmovd %eax, %xmm3
   2440 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2441 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2442 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2443 ; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2444 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2445 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2446 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2447 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2448 ; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2449 ; AVX2-NEXT:    vmovd %xmm0, %eax
   2450 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2451 ; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2452 ; AVX2-NEXT:    vmovd %xmm0, %eax
   2453 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   2454 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2455 ; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2456 ; AVX2-NEXT:    vmovd %xmm0, %eax
   2457 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   2458 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2459 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   2460 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2461 ; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2462 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2463 ; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   2464 ; AVX2-NEXT:    vmovd %xmm0, %eax
   2465 ; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   2466 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2467 ; AVX2-NEXT:    retq
   2468 ;
   2469 ; AVX512-LABEL: cvt_16f32_to_16i16:
   2470 ; AVX512:       # %bb.0:
   2471 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   2472 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   2473 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2474 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2475 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2476 ; AVX512-NEXT:    vmovd %eax, %xmm3
   2477 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2478 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2479 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2480 ; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2481 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2482 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2483 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   2484 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2485 ; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2486 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2487 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   2488 ; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2489 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2490 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   2491 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2492 ; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2493 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2494 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   2495 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2496 ; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2497 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2498 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2499 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   2500 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2501 ; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   2502 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2503 ; AVX512-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   2504 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2505 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2506 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2507 ; AVX512-NEXT:    vmovd %eax, %xmm3
   2508 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2509 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2510 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2511 ; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   2512 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2513 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2514 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2515 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2516 ; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   2517 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2518 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2519 ; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   2520 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2521 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   2522 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2523 ; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   2524 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2525 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
   2526 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2527 ; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   2528 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2529 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   2530 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2531 ; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   2532 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2533 ; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   2534 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2535 ; AVX512-NEXT:    retq
   2536   %1 = fptrunc <16 x float> %a0 to <16 x half>
   2537   %2 = bitcast <16 x half> %1 to <16 x i16>
   2538   ret <16 x i16> %2
   2539 }
   2540 
   2541 ;
   2542 ; Float to Half (Store)
   2543 ;
   2544 
   2545 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
        ; Scalar float -> half stored to memory: vcvtps2ph, extract the low word
        ; via a GPR, and store it with a 16-bit movw. Same for every subtarget.
   2546 ; ALL-LABEL: store_cvt_f32_to_i16:
   2547 ; ALL:       # %bb.0:
   2548 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2549 ; ALL-NEXT:    vmovd %xmm0, %eax
   2550 ; ALL-NEXT:    movw %ax, (%rdi)
   2551 ; ALL-NEXT:    retq
   2552   %1 = fptrunc float %a0 to half
   2553   %2 = bitcast half %1 to i16
   2554   store i16 %2, i16* %a1
   2555   ret void
   2556 }
   2557 
   2558 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
        ; <4 x float> -> <4 x half> stored to memory. Unlike the value-returning
        ; variant there is no GPR repacking: each converted lane is written with
        ; its own 16-bit movw at offsets 0/2/4/6 from %rdi.
   2559 ; ALL-LABEL: store_cvt_4f32_to_4i16:
   2560 ; ALL:       # %bb.0:
   2561 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2562 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2563 ; ALL-NEXT:    vmovd %xmm1, %eax
   2564 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2565 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2566 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2567 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2568 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2569 ; ALL-NEXT:    vmovd %xmm1, %edx
   2570 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2571 ; ALL-NEXT:    vmovd %xmm0, %esi
   2572 ; ALL-NEXT:    movw %si, (%rdi)
   2573 ; ALL-NEXT:    movw %dx, 6(%rdi)
   2574 ; ALL-NEXT:    movw %cx, 4(%rdi)
   2575 ; ALL-NEXT:    movw %ax, 2(%rdi)
   2576 ; ALL-NEXT:    retq
   2577   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2578   %2 = bitcast <4 x half> %1 to <4 x i16>
   2579   store <4 x i16> %2, <4 x i16>* %a1
   2580   ret void
   2581 }
   2582 
   2583 define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
        ; Store variant of cvt_4f32_to_8i16_undef: the four halves are packed
        ; through GPRs, widened with vpshuflw (upper lanes undef), and the full
        ; <8 x i16> is written with one aligned vmovdqa.
   2584 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
   2585 ; ALL:       # %bb.0:
   2586 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2587 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2588 ; ALL-NEXT:    vmovd %xmm1, %eax
   2589 ; ALL-NEXT:    shll $16, %eax
   2590 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2591 ; ALL-NEXT:    vmovd %xmm1, %ecx
   2592 ; ALL-NEXT:    movzwl %cx, %ecx
   2593 ; ALL-NEXT:    orl %eax, %ecx
   2594 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2595 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2596 ; ALL-NEXT:    vmovd %xmm1, %eax
   2597 ; ALL-NEXT:    shll $16, %eax
   2598 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2599 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2600 ; ALL-NEXT:    vmovd %xmm0, %edx
   2601 ; ALL-NEXT:    movzwl %dx, %edx
   2602 ; ALL-NEXT:    orl %eax, %edx
   2603 ; ALL-NEXT:    shlq $32, %rdx
   2604 ; ALL-NEXT:    orq %rcx, %rdx
   2605 ; ALL-NEXT:    vmovq %rdx, %xmm0
   2606 ; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2607 ; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
   2608 ; ALL-NEXT:    retq
   2609   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2610   %2 = bitcast <4 x half> %1 to <4 x i16>
   2611   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2612   store <8 x i16> %3, <8 x i16>* %a1
   2613   ret void
   2614 }
   2615 
   2616 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
   2617 ; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
   2618 ; AVX1:       # %bb.0:
   2619 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2620 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2621 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2622 ; AVX1-NEXT:    shll $16, %eax
   2623 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2624 ; AVX1-NEXT:    vmovd %xmm1, %ecx
   2625 ; AVX1-NEXT:    movzwl %cx, %ecx
   2626 ; AVX1-NEXT:    orl %eax, %ecx
   2627 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2628 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2629 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2630 ; AVX1-NEXT:    shll $16, %eax
   2631 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2632 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2633 ; AVX1-NEXT:    vmovd %xmm0, %edx
   2634 ; AVX1-NEXT:    movzwl %dx, %edx
   2635 ; AVX1-NEXT:    orl %eax, %edx
   2636 ; AVX1-NEXT:    shlq $32, %rdx
   2637 ; AVX1-NEXT:    orq %rcx, %rdx
   2638 ; AVX1-NEXT:    vmovq %rdx, %xmm0
   2639 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2640 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2641 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
   2642 ; AVX1-NEXT:    retq
   2643 ;
   2644 ; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero:
   2645 ; AVX2-SLOW:       # %bb.0:
   2646 ; AVX2-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2647 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2648 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
   2649 ; AVX2-SLOW-NEXT:    shll $16, %eax
   2650 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2651 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %ecx
   2652 ; AVX2-SLOW-NEXT:    movzwl %cx, %ecx
   2653 ; AVX2-SLOW-NEXT:    orl %eax, %ecx
   2654 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2655 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2656 ; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
   2657 ; AVX2-SLOW-NEXT:    shll $16, %eax
   2658 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2659 ; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2660 ; AVX2-SLOW-NEXT:    vmovd %xmm0, %edx
   2661 ; AVX2-SLOW-NEXT:    movzwl %dx, %edx
   2662 ; AVX2-SLOW-NEXT:    orl %eax, %edx
   2663 ; AVX2-SLOW-NEXT:    shlq $32, %rdx
   2664 ; AVX2-SLOW-NEXT:    orq %rcx, %rdx
   2665 ; AVX2-SLOW-NEXT:    vmovq %rdx, %xmm0
   2666 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2667 ; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2668 ; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%rdi)
   2669 ; AVX2-SLOW-NEXT:    retq
   2670 ;
   2671 ; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero:
   2672 ; AVX2-FAST:       # %bb.0:
   2673 ; AVX2-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2674 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2675 ; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
   2676 ; AVX2-FAST-NEXT:    shll $16, %eax
   2677 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2678 ; AVX2-FAST-NEXT:    vmovd %xmm1, %ecx
   2679 ; AVX2-FAST-NEXT:    movzwl %cx, %ecx
   2680 ; AVX2-FAST-NEXT:    orl %eax, %ecx
   2681 ; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2682 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2683 ; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
   2684 ; AVX2-FAST-NEXT:    shll $16, %eax
   2685 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2686 ; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2687 ; AVX2-FAST-NEXT:    vmovd %xmm0, %edx
   2688 ; AVX2-FAST-NEXT:    movzwl %dx, %edx
   2689 ; AVX2-FAST-NEXT:    orl %eax, %edx
   2690 ; AVX2-FAST-NEXT:    shlq $32, %rdx
   2691 ; AVX2-FAST-NEXT:    orq %rcx, %rdx
   2692 ; AVX2-FAST-NEXT:    vmovq %rdx, %xmm0
   2693 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   2694 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, (%rdi)
   2695 ; AVX2-FAST-NEXT:    retq
   2696 ;
   2697 ; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
   2698 ; AVX512F:       # %bb.0:
   2699 ; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2700 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2701 ; AVX512F-NEXT:    vmovd %xmm1, %eax
   2702 ; AVX512F-NEXT:    shll $16, %eax
   2703 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2704 ; AVX512F-NEXT:    vmovd %xmm1, %ecx
   2705 ; AVX512F-NEXT:    movzwl %cx, %ecx
   2706 ; AVX512F-NEXT:    orl %eax, %ecx
   2707 ; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2708 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2709 ; AVX512F-NEXT:    vmovd %xmm1, %eax
   2710 ; AVX512F-NEXT:    shll $16, %eax
   2711 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2712 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2713 ; AVX512F-NEXT:    vmovd %xmm0, %edx
   2714 ; AVX512F-NEXT:    movzwl %dx, %edx
   2715 ; AVX512F-NEXT:    orl %eax, %edx
   2716 ; AVX512F-NEXT:    shlq $32, %rdx
   2717 ; AVX512F-NEXT:    orq %rcx, %rdx
   2718 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
   2719 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   2720 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   2721 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
   2722 ; AVX512F-NEXT:    retq
   2723 ;
   2724 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
   2725 ; AVX512VL:       # %bb.0:
   2726 ; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2727 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2728 ; AVX512VL-NEXT:    vmovd %xmm1, %eax
   2729 ; AVX512VL-NEXT:    shll $16, %eax
   2730 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   2731 ; AVX512VL-NEXT:    vmovd %xmm1, %ecx
   2732 ; AVX512VL-NEXT:    movzwl %cx, %ecx
   2733 ; AVX512VL-NEXT:    orl %eax, %ecx
   2734 ; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2735 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2736 ; AVX512VL-NEXT:    vmovd %xmm1, %eax
   2737 ; AVX512VL-NEXT:    shll $16, %eax
   2738 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2739 ; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2740 ; AVX512VL-NEXT:    vmovd %xmm0, %edx
   2741 ; AVX512VL-NEXT:    movzwl %dx, %edx
   2742 ; AVX512VL-NEXT:    orl %eax, %edx
   2743 ; AVX512VL-NEXT:    shlq $32, %rdx
   2744 ; AVX512VL-NEXT:    orq %rcx, %rdx
   2745 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
   2746 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   2747 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)
   2748 ; AVX512VL-NEXT:    retq
   2749   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2750   %2 = bitcast <4 x half> %1 to <4 x i16>
   2751   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2752   store <8 x i16> %3, <8 x i16>* %a1
   2753   ret void
   2754 }
   2755 
    2756 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
; Truncates <8 x float> to <8 x half>, bitcasts to <8 x i16>, and stores the
; result through %a1.  All RUN configurations share one CHECK block (ALL): each
; f32 lane is converted with vcvtps2ph and the low 16 bits are stored with movw.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    2757 ; ALL-LABEL: store_cvt_8f32_to_8i16:
    2758 ; ALL:       # %bb.0:
    2759 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    2760 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2761 ; ALL-NEXT:    vmovd %xmm1, %r8d
    2762 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    2763 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2764 ; ALL-NEXT:    vmovd %xmm1, %r9d
    2765 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
    2766 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2767 ; ALL-NEXT:    vmovd %xmm1, %r10d
    2768 ; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
    2769 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    2770 ; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2771 ; ALL-NEXT:    vmovd %xmm2, %r11d
    2772 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    2773 ; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2774 ; ALL-NEXT:    vmovd %xmm2, %eax
    2775 ; ALL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    2776 ; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2777 ; ALL-NEXT:    vmovd %xmm2, %ecx
    2778 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
    2779 ; ALL-NEXT:    vmovd %xmm0, %edx
    2780 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
    2781 ; ALL-NEXT:    vmovd %xmm0, %esi
    2782 ; ALL-NEXT:    movw %si, 8(%rdi)
    2783 ; ALL-NEXT:    movw %dx, (%rdi)
    2784 ; ALL-NEXT:    movw %cx, 14(%rdi)
    2785 ; ALL-NEXT:    movw %ax, 12(%rdi)
    2786 ; ALL-NEXT:    movw %r11w, 10(%rdi)
    2787 ; ALL-NEXT:    movw %r10w, 6(%rdi)
    2788 ; ALL-NEXT:    movw %r9w, 4(%rdi)
    2789 ; ALL-NEXT:    movw %r8w, 2(%rdi)
    2790 ; ALL-NEXT:    vzeroupper
    2791 ; ALL-NEXT:    retq
    2792   %1 = fptrunc <8 x float> %a0 to <8 x half>
    2793   %2 = bitcast <8 x half> %1 to <8 x i16>
    2794   store <8 x i16> %2, <8 x i16>* %a1
    2795   ret void
    2796 }
   2797 
    2798 define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
; Truncates <16 x float> to <16 x half> and stores the bitcast <16 x i16>
; through %a1.  AVX1/AVX2 receive the source split over ymm0/ymm1; AVX512
; receives it in zmm0 and first extracts the upper 256 bits.  In every variant
; each lane is shuffled into element 0, converted with vcvtps2ph, and the low
; word is stored with movw at the matching byte offset.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    2799 ; AVX1-LABEL: store_cvt_16f32_to_16i16:
    2800 ; AVX1:       # %bb.0:
    2801 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    2802 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
    2803 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
    2804 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2805 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
    2806 ; AVX1-NEXT:    movw %ax, 24(%rdi)
    2807 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2808 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
    2809 ; AVX1-NEXT:    movw %ax, 16(%rdi)
    2810 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2811 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
    2812 ; AVX1-NEXT:    movw %ax, 8(%rdi)
    2813 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2814 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
    2815 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2816 ; AVX1-NEXT:    movw %ax, (%rdi)
    2817 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2818 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
    2819 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2820 ; AVX1-NEXT:    movw %ax, 30(%rdi)
    2821 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2822 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    2823 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2824 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
    2825 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2826 ; AVX1-NEXT:    movw %ax, 28(%rdi)
    2827 ; AVX1-NEXT:    vmovd %xmm3, %eax
    2828 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
    2829 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2830 ; AVX1-NEXT:    movw %ax, 26(%rdi)
    2831 ; AVX1-NEXT:    vmovd %xmm3, %eax
    2832 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    2833 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2834 ; AVX1-NEXT:    movw %ax, 22(%rdi)
    2835 ; AVX1-NEXT:    vmovd %xmm3, %eax
    2836 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    2837 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2838 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    2839 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
    2840 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
    2841 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2842 ; AVX1-NEXT:    movw %ax, 20(%rdi)
    2843 ; AVX1-NEXT:    vmovd %xmm1, %eax
    2844 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
    2845 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2846 ; AVX1-NEXT:    movw %ax, 18(%rdi)
    2847 ; AVX1-NEXT:    vmovd %xmm1, %eax
    2848 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    2849 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2850 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
    2851 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2852 ; AVX1-NEXT:    movw %ax, 14(%rdi)
    2853 ; AVX1-NEXT:    vmovd %xmm2, %eax
    2854 ; AVX1-NEXT:    movw %ax, 12(%rdi)
    2855 ; AVX1-NEXT:    vmovd %xmm1, %eax
    2856 ; AVX1-NEXT:    movw %ax, 10(%rdi)
    2857 ; AVX1-NEXT:    vmovd %xmm0, %eax
    2858 ; AVX1-NEXT:    movw %ax, 6(%rdi)
    2859 ; AVX1-NEXT:    vmovd %xmm3, %eax
    2860 ; AVX1-NEXT:    movw %ax, 4(%rdi)
    2861 ; AVX1-NEXT:    vmovd %xmm4, %eax
    2862 ; AVX1-NEXT:    movw %ax, 2(%rdi)
    2863 ; AVX1-NEXT:    vzeroupper
    2864 ; AVX1-NEXT:    retq
    2865 ;
    2866 ; AVX2-LABEL: store_cvt_16f32_to_16i16:
    2867 ; AVX2:       # %bb.0:
    2868 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
    2869 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
    2870 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
    2871 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2872 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
    2873 ; AVX2-NEXT:    movw %ax, 24(%rdi)
    2874 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2875 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
    2876 ; AVX2-NEXT:    movw %ax, 16(%rdi)
    2877 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2878 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
    2879 ; AVX2-NEXT:    movw %ax, 8(%rdi)
    2880 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2881 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
    2882 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2883 ; AVX2-NEXT:    movw %ax, (%rdi)
    2884 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2885 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
    2886 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2887 ; AVX2-NEXT:    movw %ax, 30(%rdi)
    2888 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2889 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    2890 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2891 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
    2892 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2893 ; AVX2-NEXT:    movw %ax, 28(%rdi)
    2894 ; AVX2-NEXT:    vmovd %xmm3, %eax
    2895 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
    2896 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2897 ; AVX2-NEXT:    movw %ax, 26(%rdi)
    2898 ; AVX2-NEXT:    vmovd %xmm3, %eax
    2899 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    2900 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2901 ; AVX2-NEXT:    movw %ax, 22(%rdi)
    2902 ; AVX2-NEXT:    vmovd %xmm3, %eax
    2903 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    2904 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2905 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    2906 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
    2907 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
    2908 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2909 ; AVX2-NEXT:    movw %ax, 20(%rdi)
    2910 ; AVX2-NEXT:    vmovd %xmm1, %eax
    2911 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
    2912 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2913 ; AVX2-NEXT:    movw %ax, 18(%rdi)
    2914 ; AVX2-NEXT:    vmovd %xmm1, %eax
    2915 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    2916 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2917 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
    2918 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2919 ; AVX2-NEXT:    movw %ax, 14(%rdi)
    2920 ; AVX2-NEXT:    vmovd %xmm2, %eax
    2921 ; AVX2-NEXT:    movw %ax, 12(%rdi)
    2922 ; AVX2-NEXT:    vmovd %xmm1, %eax
    2923 ; AVX2-NEXT:    movw %ax, 10(%rdi)
    2924 ; AVX2-NEXT:    vmovd %xmm0, %eax
    2925 ; AVX2-NEXT:    movw %ax, 6(%rdi)
    2926 ; AVX2-NEXT:    vmovd %xmm3, %eax
    2927 ; AVX2-NEXT:    movw %ax, 4(%rdi)
    2928 ; AVX2-NEXT:    vmovd %xmm4, %eax
    2929 ; AVX2-NEXT:    movw %ax, 2(%rdi)
    2930 ; AVX2-NEXT:    vzeroupper
    2931 ; AVX2-NEXT:    retq
    2932 ;
    2933 ; AVX512-LABEL: store_cvt_16f32_to_16i16:
    2934 ; AVX512:       # %bb.0:
    2935 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    2936 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
    2937 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
    2938 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
    2939 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2940 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
    2941 ; AVX512-NEXT:    movw %ax, 24(%rdi)
    2942 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2943 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
    2944 ; AVX512-NEXT:    movw %ax, 16(%rdi)
    2945 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2946 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
    2947 ; AVX512-NEXT:    movw %ax, 8(%rdi)
    2948 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2949 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
    2950 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2951 ; AVX512-NEXT:    movw %ax, (%rdi)
    2952 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2953 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
    2954 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2955 ; AVX512-NEXT:    movw %ax, 30(%rdi)
    2956 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2957 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    2958 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
    2959 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
    2960 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2961 ; AVX512-NEXT:    movw %ax, 28(%rdi)
    2962 ; AVX512-NEXT:    vmovd %xmm3, %eax
    2963 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
    2964 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2965 ; AVX512-NEXT:    movw %ax, 26(%rdi)
    2966 ; AVX512-NEXT:    vmovd %xmm3, %eax
    2967 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    2968 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2969 ; AVX512-NEXT:    movw %ax, 22(%rdi)
    2970 ; AVX512-NEXT:    vmovd %xmm3, %eax
    2971 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    2972 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
    2973 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    2974 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
    2975 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
    2976 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2977 ; AVX512-NEXT:    movw %ax, 20(%rdi)
    2978 ; AVX512-NEXT:    vmovd %xmm2, %eax
    2979 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    2980 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2981 ; AVX512-NEXT:    movw %ax, 18(%rdi)
    2982 ; AVX512-NEXT:    vmovd %xmm2, %eax
    2983 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    2984 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
    2985 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
    2986 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
    2987 ; AVX512-NEXT:    movw %ax, 14(%rdi)
    2988 ; AVX512-NEXT:    vmovd %xmm1, %eax
    2989 ; AVX512-NEXT:    movw %ax, 12(%rdi)
    2990 ; AVX512-NEXT:    vmovd %xmm2, %eax
    2991 ; AVX512-NEXT:    movw %ax, 10(%rdi)
    2992 ; AVX512-NEXT:    vmovd %xmm0, %eax
    2993 ; AVX512-NEXT:    movw %ax, 6(%rdi)
    2994 ; AVX512-NEXT:    vmovd %xmm3, %eax
    2995 ; AVX512-NEXT:    movw %ax, 4(%rdi)
    2996 ; AVX512-NEXT:    vmovd %xmm4, %eax
    2997 ; AVX512-NEXT:    movw %ax, 2(%rdi)
    2998 ; AVX512-NEXT:    vzeroupper
    2999 ; AVX512-NEXT:    retq
    3000   %1 = fptrunc <16 x float> %a0 to <16 x half>
    3001   %2 = bitcast <16 x half> %1 to <16 x i16>
    3002   store <16 x i16> %2, <16 x i16>* %a1
    3003   ret void
    3004 }
   3005 
   3006 ;
   3007 ; Double to Half
   3008 ;
   3009 
    3010 define i16 @cvt_f64_to_i16(double %a0) nounwind {
; Scalar f64 -> f16 truncation returned as its i16 bit pattern.  Lowers to a
; tail call to the __truncdfhf2 compiler-rt libcall in every RUN configuration.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    3011 ; ALL-LABEL: cvt_f64_to_i16:
    3012 ; ALL:       # %bb.0:
    3013 ; ALL-NEXT:    jmp __truncdfhf2 # TAILCALL
    3014   %1 = fptrunc double %a0 to half
    3015   %2 = bitcast half %1 to i16
    3016   ret i16 %2
    3017 }
   3018 
    3019 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; <2 x double> -> <2 x i16>: each element is truncated via a __truncdfhf2
; libcall (the source vector is spilled/reloaded around the calls), then the
; two 16-bit results are packed into one GPR with shll/orl and moved to xmm0.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    3020 ; ALL-LABEL: cvt_2f64_to_2i16:
    3021 ; ALL:       # %bb.0:
    3022 ; ALL-NEXT:    pushq %rbx
    3023 ; ALL-NEXT:    subq $16, %rsp
    3024 ; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3025 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3026 ; ALL-NEXT:    callq __truncdfhf2
    3027 ; ALL-NEXT:    movl %eax, %ebx
    3028 ; ALL-NEXT:    shll $16, %ebx
    3029 ; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3030 ; ALL-NEXT:    callq __truncdfhf2
    3031 ; ALL-NEXT:    movzwl %ax, %eax
    3032 ; ALL-NEXT:    orl %ebx, %eax
    3033 ; ALL-NEXT:    vmovd %eax, %xmm0
    3034 ; ALL-NEXT:    addq $16, %rsp
    3035 ; ALL-NEXT:    popq %rbx
    3036 ; ALL-NEXT:    retq
    3037   %1 = fptrunc <2 x double> %a0 to <2 x half>
    3038   %2 = bitcast <2 x half> %1 to <2 x i16>
    3039   ret <2 x i16> %2
    3040 }
   3041 
    3042 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; <4 x double> -> <4 x i16>: four __truncdfhf2 libcalls with the ymm source
; spilled to the stack between them; the four half results are packed into
; %rax with shll/orl/shlq/orq and returned in xmm0 via vmovq.  AVX1, AVX2 and
; AVX512 variants differ only in which reload instruction is emitted.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    3043 ; AVX1-LABEL: cvt_4f64_to_4i16:
    3044 ; AVX1:       # %bb.0:
    3045 ; AVX1-NEXT:    pushq %r14
    3046 ; AVX1-NEXT:    pushq %rbx
    3047 ; AVX1-NEXT:    subq $40, %rsp
    3048 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3049 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3050 ; AVX1-NEXT:    vzeroupper
    3051 ; AVX1-NEXT:    callq __truncdfhf2
    3052 ; AVX1-NEXT:    movl %eax, %ebx
    3053 ; AVX1-NEXT:    shll $16, %ebx
    3054 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3055 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3056 ; AVX1-NEXT:    vzeroupper
    3057 ; AVX1-NEXT:    callq __truncdfhf2
    3058 ; AVX1-NEXT:    movzwl %ax, %r14d
    3059 ; AVX1-NEXT:    orl %ebx, %r14d
    3060 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3061 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3062 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3063 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3064 ; AVX1-NEXT:    vzeroupper
    3065 ; AVX1-NEXT:    callq __truncdfhf2
    3066 ; AVX1-NEXT:    movl %eax, %ebx
    3067 ; AVX1-NEXT:    shll $16, %ebx
    3068 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3069 ; AVX1-NEXT:    callq __truncdfhf2
    3070 ; AVX1-NEXT:    movzwl %ax, %eax
    3071 ; AVX1-NEXT:    orl %ebx, %eax
    3072 ; AVX1-NEXT:    shlq $32, %rax
    3073 ; AVX1-NEXT:    orq %r14, %rax
    3074 ; AVX1-NEXT:    vmovq %rax, %xmm0
    3075 ; AVX1-NEXT:    addq $40, %rsp
    3076 ; AVX1-NEXT:    popq %rbx
    3077 ; AVX1-NEXT:    popq %r14
    3078 ; AVX1-NEXT:    retq
    3079 ;
    3080 ; AVX2-LABEL: cvt_4f64_to_4i16:
    3081 ; AVX2:       # %bb.0:
    3082 ; AVX2-NEXT:    pushq %r14
    3083 ; AVX2-NEXT:    pushq %rbx
    3084 ; AVX2-NEXT:    subq $40, %rsp
    3085 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3086 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3087 ; AVX2-NEXT:    vzeroupper
    3088 ; AVX2-NEXT:    callq __truncdfhf2
    3089 ; AVX2-NEXT:    movl %eax, %ebx
    3090 ; AVX2-NEXT:    shll $16, %ebx
    3091 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3092 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3093 ; AVX2-NEXT:    vzeroupper
    3094 ; AVX2-NEXT:    callq __truncdfhf2
    3095 ; AVX2-NEXT:    movzwl %ax, %r14d
    3096 ; AVX2-NEXT:    orl %ebx, %r14d
    3097 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
    3098 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3099 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3100 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3101 ; AVX2-NEXT:    vzeroupper
    3102 ; AVX2-NEXT:    callq __truncdfhf2
    3103 ; AVX2-NEXT:    movl %eax, %ebx
    3104 ; AVX2-NEXT:    shll $16, %ebx
    3105 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3106 ; AVX2-NEXT:    callq __truncdfhf2
    3107 ; AVX2-NEXT:    movzwl %ax, %eax
    3108 ; AVX2-NEXT:    orl %ebx, %eax
    3109 ; AVX2-NEXT:    shlq $32, %rax
    3110 ; AVX2-NEXT:    orq %r14, %rax
    3111 ; AVX2-NEXT:    vmovq %rax, %xmm0
    3112 ; AVX2-NEXT:    addq $40, %rsp
    3113 ; AVX2-NEXT:    popq %rbx
    3114 ; AVX2-NEXT:    popq %r14
    3115 ; AVX2-NEXT:    retq
    3116 ;
    3117 ; AVX512-LABEL: cvt_4f64_to_4i16:
    3118 ; AVX512:       # %bb.0:
    3119 ; AVX512-NEXT:    pushq %r14
    3120 ; AVX512-NEXT:    pushq %rbx
    3121 ; AVX512-NEXT:    subq $40, %rsp
    3122 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3123 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3124 ; AVX512-NEXT:    vzeroupper
    3125 ; AVX512-NEXT:    callq __truncdfhf2
    3126 ; AVX512-NEXT:    movl %eax, %ebx
    3127 ; AVX512-NEXT:    shll $16, %ebx
    3128 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3129 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3130 ; AVX512-NEXT:    vzeroupper
    3131 ; AVX512-NEXT:    callq __truncdfhf2
    3132 ; AVX512-NEXT:    movzwl %ax, %r14d
    3133 ; AVX512-NEXT:    orl %ebx, %r14d
    3134 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
    3135 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3136 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3137 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3138 ; AVX512-NEXT:    vzeroupper
    3139 ; AVX512-NEXT:    callq __truncdfhf2
    3140 ; AVX512-NEXT:    movl %eax, %ebx
    3141 ; AVX512-NEXT:    shll $16, %ebx
    3142 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3143 ; AVX512-NEXT:    callq __truncdfhf2
    3144 ; AVX512-NEXT:    movzwl %ax, %eax
    3145 ; AVX512-NEXT:    orl %ebx, %eax
    3146 ; AVX512-NEXT:    shlq $32, %rax
    3147 ; AVX512-NEXT:    orq %r14, %rax
    3148 ; AVX512-NEXT:    vmovq %rax, %xmm0
    3149 ; AVX512-NEXT:    addq $40, %rsp
    3150 ; AVX512-NEXT:    popq %rbx
    3151 ; AVX512-NEXT:    popq %r14
    3152 ; AVX512-NEXT:    retq
    3153   %1 = fptrunc <4 x double> %a0 to <4 x half>
    3154   %2 = bitcast <4 x half> %1 to <4 x i16>
    3155   ret <4 x i16> %2
    3156 }
   3157 
    3158 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; <4 x double> -> <4 x i16> widened to <8 x i16> with an undef upper half:
; four __truncdfhf2 libcalls pack the halves into %rax, and the final shuffle
; (undef high lanes) needs only a vpshuflw on the packed value.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; -- regenerate rather than hand-editing them.
    3159 ; AVX1-LABEL: cvt_4f64_to_8i16_undef:
    3160 ; AVX1:       # %bb.0:
    3161 ; AVX1-NEXT:    pushq %r14
    3162 ; AVX1-NEXT:    pushq %rbx
    3163 ; AVX1-NEXT:    subq $40, %rsp
    3164 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3165 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3166 ; AVX1-NEXT:    vzeroupper
    3167 ; AVX1-NEXT:    callq __truncdfhf2
    3168 ; AVX1-NEXT:    movl %eax, %ebx
    3169 ; AVX1-NEXT:    shll $16, %ebx
    3170 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3171 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3172 ; AVX1-NEXT:    vzeroupper
    3173 ; AVX1-NEXT:    callq __truncdfhf2
    3174 ; AVX1-NEXT:    movzwl %ax, %r14d
    3175 ; AVX1-NEXT:    orl %ebx, %r14d
    3176 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3177 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3178 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3179 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3180 ; AVX1-NEXT:    vzeroupper
    3181 ; AVX1-NEXT:    callq __truncdfhf2
    3182 ; AVX1-NEXT:    movl %eax, %ebx
    3183 ; AVX1-NEXT:    shll $16, %ebx
    3184 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3185 ; AVX1-NEXT:    callq __truncdfhf2
    3186 ; AVX1-NEXT:    movzwl %ax, %eax
    3187 ; AVX1-NEXT:    orl %ebx, %eax
    3188 ; AVX1-NEXT:    shlq $32, %rax
    3189 ; AVX1-NEXT:    orq %r14, %rax
    3190 ; AVX1-NEXT:    vmovq %rax, %xmm0
    3191 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
    3192 ; AVX1-NEXT:    addq $40, %rsp
    3193 ; AVX1-NEXT:    popq %rbx
    3194 ; AVX1-NEXT:    popq %r14
    3195 ; AVX1-NEXT:    retq
    3196 ;
    3197 ; AVX2-LABEL: cvt_4f64_to_8i16_undef:
    3198 ; AVX2:       # %bb.0:
    3199 ; AVX2-NEXT:    pushq %r14
    3200 ; AVX2-NEXT:    pushq %rbx
    3201 ; AVX2-NEXT:    subq $40, %rsp
    3202 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3203 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3204 ; AVX2-NEXT:    vzeroupper
    3205 ; AVX2-NEXT:    callq __truncdfhf2
    3206 ; AVX2-NEXT:    movl %eax, %ebx
    3207 ; AVX2-NEXT:    shll $16, %ebx
    3208 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3209 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3210 ; AVX2-NEXT:    vzeroupper
    3211 ; AVX2-NEXT:    callq __truncdfhf2
    3212 ; AVX2-NEXT:    movzwl %ax, %r14d
    3213 ; AVX2-NEXT:    orl %ebx, %r14d
    3214 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
    3215 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3216 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3217 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3218 ; AVX2-NEXT:    vzeroupper
    3219 ; AVX2-NEXT:    callq __truncdfhf2
    3220 ; AVX2-NEXT:    movl %eax, %ebx
    3221 ; AVX2-NEXT:    shll $16, %ebx
    3222 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3223 ; AVX2-NEXT:    callq __truncdfhf2
    3224 ; AVX2-NEXT:    movzwl %ax, %eax
    3225 ; AVX2-NEXT:    orl %ebx, %eax
    3226 ; AVX2-NEXT:    shlq $32, %rax
    3227 ; AVX2-NEXT:    orq %r14, %rax
    3228 ; AVX2-NEXT:    vmovq %rax, %xmm0
    3229 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
    3230 ; AVX2-NEXT:    addq $40, %rsp
    3231 ; AVX2-NEXT:    popq %rbx
    3232 ; AVX2-NEXT:    popq %r14
    3233 ; AVX2-NEXT:    retq
    3234 ;
    3235 ; AVX512-LABEL: cvt_4f64_to_8i16_undef:
    3236 ; AVX512:       # %bb.0:
    3237 ; AVX512-NEXT:    pushq %r14
    3238 ; AVX512-NEXT:    pushq %rbx
    3239 ; AVX512-NEXT:    subq $40, %rsp
    3240 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
    3241 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3242 ; AVX512-NEXT:    vzeroupper
    3243 ; AVX512-NEXT:    callq __truncdfhf2
    3244 ; AVX512-NEXT:    movl %eax, %ebx
    3245 ; AVX512-NEXT:    shll $16, %ebx
    3246 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
    3247 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    3248 ; AVX512-NEXT:    vzeroupper
    3249 ; AVX512-NEXT:    callq __truncdfhf2
    3250 ; AVX512-NEXT:    movzwl %ax, %r14d
    3251 ; AVX512-NEXT:    orl %ebx, %r14d
    3252 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
    3253 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    3254 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
    3255 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
    3256 ; AVX512-NEXT:    vzeroupper
    3257 ; AVX512-NEXT:    callq __truncdfhf2
    3258 ; AVX512-NEXT:    movl %eax, %ebx
    3259 ; AVX512-NEXT:    shll $16, %ebx
    3260 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
    3261 ; AVX512-NEXT:    callq __truncdfhf2
    3262 ; AVX512-NEXT:    movzwl %ax, %eax
    3263 ; AVX512-NEXT:    orl %ebx, %eax
    3264 ; AVX512-NEXT:    shlq $32, %rax
    3265 ; AVX512-NEXT:    orq %r14, %rax
    3266 ; AVX512-NEXT:    vmovq %rax, %xmm0
    3267 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
    3268 ; AVX512-NEXT:    addq $40, %rsp
    3269 ; AVX512-NEXT:    popq %rbx
    3270 ; AVX512-NEXT:    popq %r14
    3271 ; AVX512-NEXT:    retq
    3272   %1 = fptrunc <4 x double> %a0 to <4 x half>
    3273   %2 = bitcast <4 x half> %1 to <4 x i16>
    3274   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    3275   ret <8 x i16> %3
    3276 }
   3277 
   3278 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
   3279 ; AVX1-LABEL: cvt_4f64_to_8i16_zero:
   3280 ; AVX1:       # %bb.0:
   3281 ; AVX1-NEXT:    pushq %r14
   3282 ; AVX1-NEXT:    pushq %rbx
   3283 ; AVX1-NEXT:    subq $40, %rsp
   3284 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3285 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3286 ; AVX1-NEXT:    vzeroupper
   3287 ; AVX1-NEXT:    callq __truncdfhf2
   3288 ; AVX1-NEXT:    movl %eax, %ebx
   3289 ; AVX1-NEXT:    shll $16, %ebx
   3290 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3291 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3292 ; AVX1-NEXT:    vzeroupper
   3293 ; AVX1-NEXT:    callq __truncdfhf2
   3294 ; AVX1-NEXT:    movzwl %ax, %r14d
   3295 ; AVX1-NEXT:    orl %ebx, %r14d
   3296 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3297 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3298 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3299 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3300 ; AVX1-NEXT:    vzeroupper
   3301 ; AVX1-NEXT:    callq __truncdfhf2
   3302 ; AVX1-NEXT:    movl %eax, %ebx
   3303 ; AVX1-NEXT:    shll $16, %ebx
   3304 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3305 ; AVX1-NEXT:    callq __truncdfhf2
   3306 ; AVX1-NEXT:    movzwl %ax, %eax
   3307 ; AVX1-NEXT:    orl %ebx, %eax
   3308 ; AVX1-NEXT:    shlq $32, %rax
   3309 ; AVX1-NEXT:    orq %r14, %rax
   3310 ; AVX1-NEXT:    vmovq %rax, %xmm0
   3311 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3312 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   3313 ; AVX1-NEXT:    addq $40, %rsp
   3314 ; AVX1-NEXT:    popq %rbx
   3315 ; AVX1-NEXT:    popq %r14
   3316 ; AVX1-NEXT:    retq
   3317 ;
   3318 ; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero:
   3319 ; AVX2-SLOW:       # %bb.0:
   3320 ; AVX2-SLOW-NEXT:    pushq %r14
   3321 ; AVX2-SLOW-NEXT:    pushq %rbx
   3322 ; AVX2-SLOW-NEXT:    subq $40, %rsp
   3323 ; AVX2-SLOW-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3324 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3325 ; AVX2-SLOW-NEXT:    vzeroupper
   3326 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   3327 ; AVX2-SLOW-NEXT:    movl %eax, %ebx
   3328 ; AVX2-SLOW-NEXT:    shll $16, %ebx
   3329 ; AVX2-SLOW-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3330 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3331 ; AVX2-SLOW-NEXT:    vzeroupper
   3332 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   3333 ; AVX2-SLOW-NEXT:    movzwl %ax, %r14d
   3334 ; AVX2-SLOW-NEXT:    orl %ebx, %r14d
   3335 ; AVX2-SLOW-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3336 ; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3337 ; AVX2-SLOW-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3338 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3339 ; AVX2-SLOW-NEXT:    vzeroupper
   3340 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   3341 ; AVX2-SLOW-NEXT:    movl %eax, %ebx
   3342 ; AVX2-SLOW-NEXT:    shll $16, %ebx
   3343 ; AVX2-SLOW-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3344 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   3345 ; AVX2-SLOW-NEXT:    movzwl %ax, %eax
   3346 ; AVX2-SLOW-NEXT:    orl %ebx, %eax
   3347 ; AVX2-SLOW-NEXT:    shlq $32, %rax
   3348 ; AVX2-SLOW-NEXT:    orq %r14, %rax
   3349 ; AVX2-SLOW-NEXT:    vmovq %rax, %xmm0
   3350 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3351 ; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   3352 ; AVX2-SLOW-NEXT:    addq $40, %rsp
   3353 ; AVX2-SLOW-NEXT:    popq %rbx
   3354 ; AVX2-SLOW-NEXT:    popq %r14
   3355 ; AVX2-SLOW-NEXT:    retq
   3356 ;
   3357 ; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero:
   3358 ; AVX2-FAST:       # %bb.0:
   3359 ; AVX2-FAST-NEXT:    pushq %r14
   3360 ; AVX2-FAST-NEXT:    pushq %rbx
   3361 ; AVX2-FAST-NEXT:    subq $40, %rsp
   3362 ; AVX2-FAST-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3363 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3364 ; AVX2-FAST-NEXT:    vzeroupper
   3365 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   3366 ; AVX2-FAST-NEXT:    movl %eax, %ebx
   3367 ; AVX2-FAST-NEXT:    shll $16, %ebx
   3368 ; AVX2-FAST-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3369 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3370 ; AVX2-FAST-NEXT:    vzeroupper
   3371 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   3372 ; AVX2-FAST-NEXT:    movzwl %ax, %r14d
   3373 ; AVX2-FAST-NEXT:    orl %ebx, %r14d
   3374 ; AVX2-FAST-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3375 ; AVX2-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3376 ; AVX2-FAST-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3377 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3378 ; AVX2-FAST-NEXT:    vzeroupper
   3379 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   3380 ; AVX2-FAST-NEXT:    movl %eax, %ebx
   3381 ; AVX2-FAST-NEXT:    shll $16, %ebx
   3382 ; AVX2-FAST-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3383 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   3384 ; AVX2-FAST-NEXT:    movzwl %ax, %eax
   3385 ; AVX2-FAST-NEXT:    orl %ebx, %eax
   3386 ; AVX2-FAST-NEXT:    shlq $32, %rax
   3387 ; AVX2-FAST-NEXT:    orq %r14, %rax
   3388 ; AVX2-FAST-NEXT:    vmovq %rax, %xmm0
   3389 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   3390 ; AVX2-FAST-NEXT:    addq $40, %rsp
   3391 ; AVX2-FAST-NEXT:    popq %rbx
   3392 ; AVX2-FAST-NEXT:    popq %r14
   3393 ; AVX2-FAST-NEXT:    retq
   3394 ;
   3395 ; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
   3396 ; AVX512F:       # %bb.0:
   3397 ; AVX512F-NEXT:    pushq %r14
   3398 ; AVX512F-NEXT:    pushq %rbx
   3399 ; AVX512F-NEXT:    subq $40, %rsp
   3400 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3401 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3402 ; AVX512F-NEXT:    vzeroupper
   3403 ; AVX512F-NEXT:    callq __truncdfhf2
   3404 ; AVX512F-NEXT:    movl %eax, %ebx
   3405 ; AVX512F-NEXT:    shll $16, %ebx
   3406 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3407 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3408 ; AVX512F-NEXT:    vzeroupper
   3409 ; AVX512F-NEXT:    callq __truncdfhf2
   3410 ; AVX512F-NEXT:    movzwl %ax, %r14d
   3411 ; AVX512F-NEXT:    orl %ebx, %r14d
   3412 ; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3413 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3414 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3415 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3416 ; AVX512F-NEXT:    vzeroupper
   3417 ; AVX512F-NEXT:    callq __truncdfhf2
   3418 ; AVX512F-NEXT:    movl %eax, %ebx
   3419 ; AVX512F-NEXT:    shll $16, %ebx
   3420 ; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3421 ; AVX512F-NEXT:    callq __truncdfhf2
   3422 ; AVX512F-NEXT:    movzwl %ax, %eax
   3423 ; AVX512F-NEXT:    orl %ebx, %eax
   3424 ; AVX512F-NEXT:    shlq $32, %rax
   3425 ; AVX512F-NEXT:    orq %r14, %rax
   3426 ; AVX512F-NEXT:    vmovq %rax, %xmm0
   3427 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3428 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   3429 ; AVX512F-NEXT:    addq $40, %rsp
   3430 ; AVX512F-NEXT:    popq %rbx
   3431 ; AVX512F-NEXT:    popq %r14
   3432 ; AVX512F-NEXT:    retq
   3433 ;
   3434 ; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
   3435 ; AVX512VL:       # %bb.0:
   3436 ; AVX512VL-NEXT:    pushq %r14
   3437 ; AVX512VL-NEXT:    pushq %rbx
   3438 ; AVX512VL-NEXT:    subq $40, %rsp
   3439 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3440 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3441 ; AVX512VL-NEXT:    vzeroupper
   3442 ; AVX512VL-NEXT:    callq __truncdfhf2
   3443 ; AVX512VL-NEXT:    movl %eax, %ebx
   3444 ; AVX512VL-NEXT:    shll $16, %ebx
   3445 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3446 ; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3447 ; AVX512VL-NEXT:    vzeroupper
   3448 ; AVX512VL-NEXT:    callq __truncdfhf2
   3449 ; AVX512VL-NEXT:    movzwl %ax, %r14d
   3450 ; AVX512VL-NEXT:    orl %ebx, %r14d
   3451 ; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3452 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3453 ; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3454 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3455 ; AVX512VL-NEXT:    vzeroupper
   3456 ; AVX512VL-NEXT:    callq __truncdfhf2
   3457 ; AVX512VL-NEXT:    movl %eax, %ebx
   3458 ; AVX512VL-NEXT:    shll $16, %ebx
   3459 ; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3460 ; AVX512VL-NEXT:    callq __truncdfhf2
   3461 ; AVX512VL-NEXT:    movzwl %ax, %eax
   3462 ; AVX512VL-NEXT:    orl %ebx, %eax
   3463 ; AVX512VL-NEXT:    shlq $32, %rax
   3464 ; AVX512VL-NEXT:    orq %r14, %rax
   3465 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
   3466 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   3467 ; AVX512VL-NEXT:    addq $40, %rsp
   3468 ; AVX512VL-NEXT:    popq %rbx
   3469 ; AVX512VL-NEXT:    popq %r14
   3470 ; AVX512VL-NEXT:    retq
   3471   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3472   %2 = bitcast <4 x half> %1 to <4 x i16>
   3473   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   3474   ret <8 x i16> %3
   3475 }
   3476 
   3477 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
        ; Register-returned v8f64 -> v8i16 truncation. The CHECK lines show each
        ; f64 lane being converted through a separate __truncdfhf2 libcall; the
        ; eight i16 results are packed with shll/orl/shlq/orq into two 64-bit
        ; GPRs and recombined via vmovq + vpunpcklqdq into one xmm result.
        ; AVX512 differs only in spilling the full zmm and using vextractf64x4
        ; to reach the upper four lanes.
   3478 ; AVX1-LABEL: cvt_8f64_to_8i16:
   3479 ; AVX1:       # %bb.0:
   3480 ; AVX1-NEXT:    pushq %r15
   3481 ; AVX1-NEXT:    pushq %r14
   3482 ; AVX1-NEXT:    pushq %rbx
   3483 ; AVX1-NEXT:    subq $64, %rsp
   3484 ; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
   3485 ; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   3486 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3487 ; AVX1-NEXT:    vzeroupper
   3488 ; AVX1-NEXT:    callq __truncdfhf2
   3489 ; AVX1-NEXT:    movl %eax, %ebx
   3490 ; AVX1-NEXT:    shll $16, %ebx
   3491 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3492 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3493 ; AVX1-NEXT:    vzeroupper
   3494 ; AVX1-NEXT:    callq __truncdfhf2
   3495 ; AVX1-NEXT:    movzwl %ax, %r15d
   3496 ; AVX1-NEXT:    orl %ebx, %r15d
   3497 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3498 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3499 ; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3500 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3501 ; AVX1-NEXT:    vzeroupper
   3502 ; AVX1-NEXT:    callq __truncdfhf2
   3503 ; AVX1-NEXT:    movl %eax, %ebx
   3504 ; AVX1-NEXT:    shll $16, %ebx
   3505 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3506 ; AVX1-NEXT:    callq __truncdfhf2
   3507 ; AVX1-NEXT:    movzwl %ax, %r14d
   3508 ; AVX1-NEXT:    orl %ebx, %r14d
   3509 ; AVX1-NEXT:    shlq $32, %r14
   3510 ; AVX1-NEXT:    orq %r15, %r14
   3511 ; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
   3512 ; AVX1-NEXT:    # xmm0 = mem[1,0]
   3513 ; AVX1-NEXT:    callq __truncdfhf2
   3514 ; AVX1-NEXT:    movl %eax, %ebx
   3515 ; AVX1-NEXT:    shll $16, %ebx
   3516 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3517 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3518 ; AVX1-NEXT:    vzeroupper
   3519 ; AVX1-NEXT:    callq __truncdfhf2
   3520 ; AVX1-NEXT:    movzwl %ax, %r15d
   3521 ; AVX1-NEXT:    orl %ebx, %r15d
   3522 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3523 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3524 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3525 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3526 ; AVX1-NEXT:    vzeroupper
   3527 ; AVX1-NEXT:    callq __truncdfhf2
   3528 ; AVX1-NEXT:    movl %eax, %ebx
   3529 ; AVX1-NEXT:    shll $16, %ebx
   3530 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3531 ; AVX1-NEXT:    callq __truncdfhf2
   3532 ; AVX1-NEXT:    movzwl %ax, %eax
   3533 ; AVX1-NEXT:    orl %ebx, %eax
   3534 ; AVX1-NEXT:    shlq $32, %rax
   3535 ; AVX1-NEXT:    orq %r15, %rax
   3536 ; AVX1-NEXT:    vmovq %rax, %xmm0
   3537 ; AVX1-NEXT:    vmovq %r14, %xmm1
   3538 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   3539 ; AVX1-NEXT:    addq $64, %rsp
   3540 ; AVX1-NEXT:    popq %rbx
   3541 ; AVX1-NEXT:    popq %r14
   3542 ; AVX1-NEXT:    popq %r15
   3543 ; AVX1-NEXT:    retq
   3544 ;
   3545 ; AVX2-LABEL: cvt_8f64_to_8i16:
   3546 ; AVX2:       # %bb.0:
   3547 ; AVX2-NEXT:    pushq %r15
   3548 ; AVX2-NEXT:    pushq %r14
   3549 ; AVX2-NEXT:    pushq %rbx
   3550 ; AVX2-NEXT:    subq $64, %rsp
   3551 ; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
   3552 ; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   3553 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3554 ; AVX2-NEXT:    vzeroupper
   3555 ; AVX2-NEXT:    callq __truncdfhf2
   3556 ; AVX2-NEXT:    movl %eax, %ebx
   3557 ; AVX2-NEXT:    shll $16, %ebx
   3558 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3559 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3560 ; AVX2-NEXT:    vzeroupper
   3561 ; AVX2-NEXT:    callq __truncdfhf2
   3562 ; AVX2-NEXT:    movzwl %ax, %r15d
   3563 ; AVX2-NEXT:    orl %ebx, %r15d
   3564 ; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3565 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3566 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3567 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3568 ; AVX2-NEXT:    vzeroupper
   3569 ; AVX2-NEXT:    callq __truncdfhf2
   3570 ; AVX2-NEXT:    movl %eax, %ebx
   3571 ; AVX2-NEXT:    shll $16, %ebx
   3572 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3573 ; AVX2-NEXT:    callq __truncdfhf2
   3574 ; AVX2-NEXT:    movzwl %ax, %r14d
   3575 ; AVX2-NEXT:    orl %ebx, %r14d
   3576 ; AVX2-NEXT:    shlq $32, %r14
   3577 ; AVX2-NEXT:    orq %r15, %r14
   3578 ; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
   3579 ; AVX2-NEXT:    # xmm0 = mem[1,0]
   3580 ; AVX2-NEXT:    callq __truncdfhf2
   3581 ; AVX2-NEXT:    movl %eax, %ebx
   3582 ; AVX2-NEXT:    shll $16, %ebx
   3583 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3584 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3585 ; AVX2-NEXT:    vzeroupper
   3586 ; AVX2-NEXT:    callq __truncdfhf2
   3587 ; AVX2-NEXT:    movzwl %ax, %r15d
   3588 ; AVX2-NEXT:    orl %ebx, %r15d
   3589 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3590 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3591 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3592 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3593 ; AVX2-NEXT:    vzeroupper
   3594 ; AVX2-NEXT:    callq __truncdfhf2
   3595 ; AVX2-NEXT:    movl %eax, %ebx
   3596 ; AVX2-NEXT:    shll $16, %ebx
   3597 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3598 ; AVX2-NEXT:    callq __truncdfhf2
   3599 ; AVX2-NEXT:    movzwl %ax, %eax
   3600 ; AVX2-NEXT:    orl %ebx, %eax
   3601 ; AVX2-NEXT:    shlq $32, %rax
   3602 ; AVX2-NEXT:    orq %r15, %rax
   3603 ; AVX2-NEXT:    vmovq %rax, %xmm0
   3604 ; AVX2-NEXT:    vmovq %r14, %xmm1
   3605 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   3606 ; AVX2-NEXT:    addq $64, %rsp
   3607 ; AVX2-NEXT:    popq %rbx
   3608 ; AVX2-NEXT:    popq %r14
   3609 ; AVX2-NEXT:    popq %r15
   3610 ; AVX2-NEXT:    retq
   3611 ;
   3612 ; AVX512-LABEL: cvt_8f64_to_8i16:
   3613 ; AVX512:       # %bb.0:
   3614 ; AVX512-NEXT:    pushq %r15
   3615 ; AVX512-NEXT:    pushq %r14
   3616 ; AVX512-NEXT:    pushq %rbx
   3617 ; AVX512-NEXT:    subq $96, %rsp
   3618 ; AVX512-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
   3619 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3620 ; AVX512-NEXT:    vzeroupper
   3621 ; AVX512-NEXT:    callq __truncdfhf2
   3622 ; AVX512-NEXT:    movl %eax, %ebx
   3623 ; AVX512-NEXT:    shll $16, %ebx
   3624 ; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
   3625 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   3626 ; AVX512-NEXT:    vzeroupper
   3627 ; AVX512-NEXT:    callq __truncdfhf2
   3628 ; AVX512-NEXT:    movzwl %ax, %r15d
   3629 ; AVX512-NEXT:    orl %ebx, %r15d
   3630 ; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
   3631 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3632 ; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3633 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3634 ; AVX512-NEXT:    vzeroupper
   3635 ; AVX512-NEXT:    callq __truncdfhf2
   3636 ; AVX512-NEXT:    movl %eax, %ebx
   3637 ; AVX512-NEXT:    shll $16, %ebx
   3638 ; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3639 ; AVX512-NEXT:    callq __truncdfhf2
   3640 ; AVX512-NEXT:    movzwl %ax, %r14d
   3641 ; AVX512-NEXT:    orl %ebx, %r14d
   3642 ; AVX512-NEXT:    shlq $32, %r14
   3643 ; AVX512-NEXT:    orq %r15, %r14
   3644 ; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
   3645 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
   3646 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3647 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3648 ; AVX512-NEXT:    vzeroupper
   3649 ; AVX512-NEXT:    callq __truncdfhf2
   3650 ; AVX512-NEXT:    movl %eax, %ebx
   3651 ; AVX512-NEXT:    shll $16, %ebx
   3652 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3653 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3654 ; AVX512-NEXT:    vzeroupper
   3655 ; AVX512-NEXT:    callq __truncdfhf2
   3656 ; AVX512-NEXT:    movzwl %ax, %r15d
   3657 ; AVX512-NEXT:    orl %ebx, %r15d
   3658 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3659 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3660 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3661 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3662 ; AVX512-NEXT:    vzeroupper
   3663 ; AVX512-NEXT:    callq __truncdfhf2
   3664 ; AVX512-NEXT:    movl %eax, %ebx
   3665 ; AVX512-NEXT:    shll $16, %ebx
   3666 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3667 ; AVX512-NEXT:    callq __truncdfhf2
   3668 ; AVX512-NEXT:    movzwl %ax, %eax
   3669 ; AVX512-NEXT:    orl %ebx, %eax
   3670 ; AVX512-NEXT:    shlq $32, %rax
   3671 ; AVX512-NEXT:    orq %r15, %rax
   3672 ; AVX512-NEXT:    vmovq %rax, %xmm0
   3673 ; AVX512-NEXT:    vmovq %r14, %xmm1
   3674 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   3675 ; AVX512-NEXT:    addq $96, %rsp
   3676 ; AVX512-NEXT:    popq %rbx
   3677 ; AVX512-NEXT:    popq %r14
   3678 ; AVX512-NEXT:    popq %r15
   3679 ; AVX512-NEXT:    retq
        ; IR under test: lanewise fptrunc to half, reinterpreted as i16 lanes.
   3680   %1 = fptrunc <8 x double> %a0 to <8 x half>
   3681   %2 = bitcast <8 x half> %1 to <8 x i16>
   3682   ret <8 x i16> %2
   3683 }
   3684 
   3685 ;
   3686 ; Double to Half (Store)
   3687 ;
   3688 
   3689 define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
        ; Scalar f64 -> f16 truncating store: a single __truncdfhf2 libcall
        ; followed by a 16-bit store through the pointer saved in %rbx.
   3690 ; ALL-LABEL: store_cvt_f64_to_i16:
   3691 ; ALL:       # %bb.0:
   3692 ; ALL-NEXT:    pushq %rbx
   3693 ; ALL-NEXT:    movq %rdi, %rbx
   3694 ; ALL-NEXT:    callq __truncdfhf2
   3695 ; ALL-NEXT:    movw %ax, (%rbx)
   3696 ; ALL-NEXT:    popq %rbx
   3697 ; ALL-NEXT:    retq
   3698   %1 = fptrunc double %a0 to half
   3699   %2 = bitcast half %1 to i16
   3700   store i16 %2, i16* %a1
   3701   ret void
   3702 }
   3703 
   3704 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
        ; Two-lane truncating store: the input xmm is spilled, the high lane is
        ; converted first (result parked in %ebp), then the low lane; the two
        ; i16 results are stored at byte offsets 0 and 2 of the destination.
   3705 ; ALL-LABEL: store_cvt_2f64_to_2i16:
   3706 ; ALL:       # %bb.0:
   3707 ; ALL-NEXT:    pushq %rbp
   3708 ; ALL-NEXT:    pushq %rbx
   3709 ; ALL-NEXT:    subq $24, %rsp
   3710 ; ALL-NEXT:    movq %rdi, %rbx
   3711 ; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3712 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3713 ; ALL-NEXT:    callq __truncdfhf2
   3714 ; ALL-NEXT:    movl %eax, %ebp
   3715 ; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3716 ; ALL-NEXT:    callq __truncdfhf2
   3717 ; ALL-NEXT:    movw %ax, (%rbx)
   3718 ; ALL-NEXT:    movw %bp, 2(%rbx)
   3719 ; ALL-NEXT:    addq $24, %rsp
   3720 ; ALL-NEXT:    popq %rbx
   3721 ; ALL-NEXT:    popq %rbp
   3722 ; ALL-NEXT:    retq
   3723   %1 = fptrunc <2 x double> %a0 to <2 x half>
   3724   %2 = bitcast <2 x half> %1 to <2 x i16>
   3725   store <2 x i16> %2, <2 x i16>* %a1
   3726   ret void
   3727 }
   3728 
   3729 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
        ; Four-lane truncating store: the ymm input is spilled, the upper 128
        ; bits reached with vextractf128, and each f64 lane goes through its
        ; own __truncdfhf2 call; the four i16 results are stored individually
        ; at offsets 0/2/4/6. All three prefix variants emit the same shape.
   3730 ; AVX1-LABEL: store_cvt_4f64_to_4i16:
   3731 ; AVX1:       # %bb.0:
   3732 ; AVX1-NEXT:    pushq %rbp
   3733 ; AVX1-NEXT:    pushq %r15
   3734 ; AVX1-NEXT:    pushq %r14
   3735 ; AVX1-NEXT:    pushq %rbx
   3736 ; AVX1-NEXT:    subq $88, %rsp
   3737 ; AVX1-NEXT:    movq %rdi, %rbx
   3738 ; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   3739 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3740 ; AVX1-NEXT:    vzeroupper
   3741 ; AVX1-NEXT:    callq __truncdfhf2
   3742 ; AVX1-NEXT:    movl %eax, %r14d
   3743 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3744 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3745 ; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3746 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3747 ; AVX1-NEXT:    vzeroupper
   3748 ; AVX1-NEXT:    callq __truncdfhf2
   3749 ; AVX1-NEXT:    movl %eax, %r15d
   3750 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3751 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3752 ; AVX1-NEXT:    vzeroupper
   3753 ; AVX1-NEXT:    callq __truncdfhf2
   3754 ; AVX1-NEXT:    movl %eax, %ebp
   3755 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3756 ; AVX1-NEXT:    callq __truncdfhf2
   3757 ; AVX1-NEXT:    movw %ax, 4(%rbx)
   3758 ; AVX1-NEXT:    movw %bp, (%rbx)
   3759 ; AVX1-NEXT:    movw %r15w, 6(%rbx)
   3760 ; AVX1-NEXT:    movw %r14w, 2(%rbx)
   3761 ; AVX1-NEXT:    addq $88, %rsp
   3762 ; AVX1-NEXT:    popq %rbx
   3763 ; AVX1-NEXT:    popq %r14
   3764 ; AVX1-NEXT:    popq %r15
   3765 ; AVX1-NEXT:    popq %rbp
   3766 ; AVX1-NEXT:    retq
   3767 ;
   3768 ; AVX2-LABEL: store_cvt_4f64_to_4i16:
   3769 ; AVX2:       # %bb.0:
   3770 ; AVX2-NEXT:    pushq %rbp
   3771 ; AVX2-NEXT:    pushq %r15
   3772 ; AVX2-NEXT:    pushq %r14
   3773 ; AVX2-NEXT:    pushq %rbx
   3774 ; AVX2-NEXT:    subq $88, %rsp
   3775 ; AVX2-NEXT:    movq %rdi, %rbx
   3776 ; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   3777 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3778 ; AVX2-NEXT:    vzeroupper
   3779 ; AVX2-NEXT:    callq __truncdfhf2
   3780 ; AVX2-NEXT:    movl %eax, %r14d
   3781 ; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3782 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3783 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3784 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3785 ; AVX2-NEXT:    vzeroupper
   3786 ; AVX2-NEXT:    callq __truncdfhf2
   3787 ; AVX2-NEXT:    movl %eax, %r15d
   3788 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3789 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3790 ; AVX2-NEXT:    vzeroupper
   3791 ; AVX2-NEXT:    callq __truncdfhf2
   3792 ; AVX2-NEXT:    movl %eax, %ebp
   3793 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3794 ; AVX2-NEXT:    callq __truncdfhf2
   3795 ; AVX2-NEXT:    movw %ax, 4(%rbx)
   3796 ; AVX2-NEXT:    movw %bp, (%rbx)
   3797 ; AVX2-NEXT:    movw %r15w, 6(%rbx)
   3798 ; AVX2-NEXT:    movw %r14w, 2(%rbx)
   3799 ; AVX2-NEXT:    addq $88, %rsp
   3800 ; AVX2-NEXT:    popq %rbx
   3801 ; AVX2-NEXT:    popq %r14
   3802 ; AVX2-NEXT:    popq %r15
   3803 ; AVX2-NEXT:    popq %rbp
   3804 ; AVX2-NEXT:    retq
   3805 ;
   3806 ; AVX512-LABEL: store_cvt_4f64_to_4i16:
   3807 ; AVX512:       # %bb.0:
   3808 ; AVX512-NEXT:    pushq %rbp
   3809 ; AVX512-NEXT:    pushq %r15
   3810 ; AVX512-NEXT:    pushq %r14
   3811 ; AVX512-NEXT:    pushq %rbx
   3812 ; AVX512-NEXT:    subq $88, %rsp
   3813 ; AVX512-NEXT:    movq %rdi, %rbx
   3814 ; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   3815 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3816 ; AVX512-NEXT:    vzeroupper
   3817 ; AVX512-NEXT:    callq __truncdfhf2
   3818 ; AVX512-NEXT:    movl %eax, %r14d
   3819 ; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3820 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3821 ; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   3822 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3823 ; AVX512-NEXT:    vzeroupper
   3824 ; AVX512-NEXT:    callq __truncdfhf2
   3825 ; AVX512-NEXT:    movl %eax, %r15d
   3826 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   3827 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3828 ; AVX512-NEXT:    vzeroupper
   3829 ; AVX512-NEXT:    callq __truncdfhf2
   3830 ; AVX512-NEXT:    movl %eax, %ebp
   3831 ; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   3832 ; AVX512-NEXT:    callq __truncdfhf2
   3833 ; AVX512-NEXT:    movw %ax, 4(%rbx)
   3834 ; AVX512-NEXT:    movw %bp, (%rbx)
   3835 ; AVX512-NEXT:    movw %r15w, 6(%rbx)
   3836 ; AVX512-NEXT:    movw %r14w, 2(%rbx)
   3837 ; AVX512-NEXT:    addq $88, %rsp
   3838 ; AVX512-NEXT:    popq %rbx
   3839 ; AVX512-NEXT:    popq %r14
   3840 ; AVX512-NEXT:    popq %r15
   3841 ; AVX512-NEXT:    popq %rbp
   3842 ; AVX512-NEXT:    retq
   3843   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3844   %2 = bitcast <4 x half> %1 to <4 x i16>
   3845   store <4 x i16> %2, <4 x i16>* %a1
   3846   ret void
   3847 }
   3848 
   3849 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
        ; Like store_cvt_4f64_to_4i16, but the 4 converted i16s are packed into
        ; one 64-bit GPR (shll/orl/shlq/orq), moved to xmm0, shuffled into the
        ; low 4 lanes with vpshuflw, and stored as a full <8 x i16> whose upper
        ; lanes come from the shufflevector's undef operand.
   3850 ; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
   3851 ; AVX1:       # %bb.0:
   3852 ; AVX1-NEXT:    pushq %rbp
   3853 ; AVX1-NEXT:    pushq %r14
   3854 ; AVX1-NEXT:    pushq %rbx
   3855 ; AVX1-NEXT:    subq $32, %rsp
   3856 ; AVX1-NEXT:    movq %rdi, %r14
   3857 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3858 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3859 ; AVX1-NEXT:    vzeroupper
   3860 ; AVX1-NEXT:    callq __truncdfhf2
   3861 ; AVX1-NEXT:    movl %eax, %ebp
   3862 ; AVX1-NEXT:    shll $16, %ebp
   3863 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3864 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3865 ; AVX1-NEXT:    vzeroupper
   3866 ; AVX1-NEXT:    callq __truncdfhf2
   3867 ; AVX1-NEXT:    movzwl %ax, %ebx
   3868 ; AVX1-NEXT:    orl %ebp, %ebx
   3869 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3870 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3871 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3872 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3873 ; AVX1-NEXT:    vzeroupper
   3874 ; AVX1-NEXT:    callq __truncdfhf2
   3875 ; AVX1-NEXT:    movl %eax, %ebp
   3876 ; AVX1-NEXT:    shll $16, %ebp
   3877 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3878 ; AVX1-NEXT:    callq __truncdfhf2
   3879 ; AVX1-NEXT:    movzwl %ax, %eax
   3880 ; AVX1-NEXT:    orl %ebp, %eax
   3881 ; AVX1-NEXT:    shlq $32, %rax
   3882 ; AVX1-NEXT:    orq %rbx, %rax
   3883 ; AVX1-NEXT:    vmovq %rax, %xmm0
   3884 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3885 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
   3886 ; AVX1-NEXT:    addq $32, %rsp
   3887 ; AVX1-NEXT:    popq %rbx
   3888 ; AVX1-NEXT:    popq %r14
   3889 ; AVX1-NEXT:    popq %rbp
   3890 ; AVX1-NEXT:    retq
   3891 ;
   3892 ; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
   3893 ; AVX2:       # %bb.0:
   3894 ; AVX2-NEXT:    pushq %rbp
   3895 ; AVX2-NEXT:    pushq %r14
   3896 ; AVX2-NEXT:    pushq %rbx
   3897 ; AVX2-NEXT:    subq $32, %rsp
   3898 ; AVX2-NEXT:    movq %rdi, %r14
   3899 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3900 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3901 ; AVX2-NEXT:    vzeroupper
   3902 ; AVX2-NEXT:    callq __truncdfhf2
   3903 ; AVX2-NEXT:    movl %eax, %ebp
   3904 ; AVX2-NEXT:    shll $16, %ebp
   3905 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3906 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3907 ; AVX2-NEXT:    vzeroupper
   3908 ; AVX2-NEXT:    callq __truncdfhf2
   3909 ; AVX2-NEXT:    movzwl %ax, %ebx
   3910 ; AVX2-NEXT:    orl %ebp, %ebx
   3911 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3912 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3913 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3914 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3915 ; AVX2-NEXT:    vzeroupper
   3916 ; AVX2-NEXT:    callq __truncdfhf2
   3917 ; AVX2-NEXT:    movl %eax, %ebp
   3918 ; AVX2-NEXT:    shll $16, %ebp
   3919 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3920 ; AVX2-NEXT:    callq __truncdfhf2
   3921 ; AVX2-NEXT:    movzwl %ax, %eax
   3922 ; AVX2-NEXT:    orl %ebp, %eax
   3923 ; AVX2-NEXT:    shlq $32, %rax
   3924 ; AVX2-NEXT:    orq %rbx, %rax
   3925 ; AVX2-NEXT:    vmovq %rax, %xmm0
   3926 ; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3927 ; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
   3928 ; AVX2-NEXT:    addq $32, %rsp
   3929 ; AVX2-NEXT:    popq %rbx
   3930 ; AVX2-NEXT:    popq %r14
   3931 ; AVX2-NEXT:    popq %rbp
   3932 ; AVX2-NEXT:    retq
   3933 ;
   3934 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
   3935 ; AVX512:       # %bb.0:
   3936 ; AVX512-NEXT:    pushq %rbp
   3937 ; AVX512-NEXT:    pushq %r14
   3938 ; AVX512-NEXT:    pushq %rbx
   3939 ; AVX512-NEXT:    subq $32, %rsp
   3940 ; AVX512-NEXT:    movq %rdi, %r14
   3941 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3942 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3943 ; AVX512-NEXT:    vzeroupper
   3944 ; AVX512-NEXT:    callq __truncdfhf2
   3945 ; AVX512-NEXT:    movl %eax, %ebp
   3946 ; AVX512-NEXT:    shll $16, %ebp
   3947 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3948 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3949 ; AVX512-NEXT:    vzeroupper
   3950 ; AVX512-NEXT:    callq __truncdfhf2
   3951 ; AVX512-NEXT:    movzwl %ax, %ebx
   3952 ; AVX512-NEXT:    orl %ebp, %ebx
   3953 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3954 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3955 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3956 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3957 ; AVX512-NEXT:    vzeroupper
   3958 ; AVX512-NEXT:    callq __truncdfhf2
   3959 ; AVX512-NEXT:    movl %eax, %ebp
   3960 ; AVX512-NEXT:    shll $16, %ebp
   3961 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3962 ; AVX512-NEXT:    callq __truncdfhf2
   3963 ; AVX512-NEXT:    movzwl %ax, %eax
   3964 ; AVX512-NEXT:    orl %ebp, %eax
   3965 ; AVX512-NEXT:    shlq $32, %rax
   3966 ; AVX512-NEXT:    orq %rbx, %rax
   3967 ; AVX512-NEXT:    vmovq %rax, %xmm0
   3968 ; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   3969 ; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
   3970 ; AVX512-NEXT:    addq $32, %rsp
   3971 ; AVX512-NEXT:    popq %rbx
   3972 ; AVX512-NEXT:    popq %r14
   3973 ; AVX512-NEXT:    popq %rbp
   3974 ; AVX512-NEXT:    retq
   3975   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3976   %2 = bitcast <4 x half> %1 to <4 x i16>
   3977   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   3978   store <8 x i16> %3, <8 x i16>* %a1
   3979   ret void
   3980 }
   3981 
   3982 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
   3983 ; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
   3984 ; AVX1:       # %bb.0:
   3985 ; AVX1-NEXT:    pushq %rbp
   3986 ; AVX1-NEXT:    pushq %r14
   3987 ; AVX1-NEXT:    pushq %rbx
   3988 ; AVX1-NEXT:    subq $32, %rsp
   3989 ; AVX1-NEXT:    movq %rdi, %r14
   3990 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3991 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3992 ; AVX1-NEXT:    vzeroupper
   3993 ; AVX1-NEXT:    callq __truncdfhf2
   3994 ; AVX1-NEXT:    movl %eax, %ebp
   3995 ; AVX1-NEXT:    shll $16, %ebp
   3996 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3997 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3998 ; AVX1-NEXT:    vzeroupper
   3999 ; AVX1-NEXT:    callq __truncdfhf2
   4000 ; AVX1-NEXT:    movzwl %ax, %ebx
   4001 ; AVX1-NEXT:    orl %ebp, %ebx
   4002 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   4003 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4004 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   4005 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4006 ; AVX1-NEXT:    vzeroupper
   4007 ; AVX1-NEXT:    callq __truncdfhf2
   4008 ; AVX1-NEXT:    movl %eax, %ebp
   4009 ; AVX1-NEXT:    shll $16, %ebp
   4010 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   4011 ; AVX1-NEXT:    callq __truncdfhf2
   4012 ; AVX1-NEXT:    movzwl %ax, %eax
   4013 ; AVX1-NEXT:    orl %ebp, %eax
   4014 ; AVX1-NEXT:    shlq $32, %rax
   4015 ; AVX1-NEXT:    orq %rbx, %rax
   4016 ; AVX1-NEXT:    vmovq %rax, %xmm0
   4017 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   4018 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   4019 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
   4020 ; AVX1-NEXT:    addq $32, %rsp
   4021 ; AVX1-NEXT:    popq %rbx
   4022 ; AVX1-NEXT:    popq %r14
   4023 ; AVX1-NEXT:    popq %rbp
   4024 ; AVX1-NEXT:    retq
   4025 ;
   4026 ; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero:
   4027 ; AVX2-SLOW:       # %bb.0:
   4028 ; AVX2-SLOW-NEXT:    pushq %rbp
   4029 ; AVX2-SLOW-NEXT:    pushq %r14
   4030 ; AVX2-SLOW-NEXT:    pushq %rbx
   4031 ; AVX2-SLOW-NEXT:    subq $32, %rsp
   4032 ; AVX2-SLOW-NEXT:    movq %rdi, %r14
   4033 ; AVX2-SLOW-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   4034 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4035 ; AVX2-SLOW-NEXT:    vzeroupper
   4036 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   4037 ; AVX2-SLOW-NEXT:    movl %eax, %ebp
   4038 ; AVX2-SLOW-NEXT:    shll $16, %ebp
   4039 ; AVX2-SLOW-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   4040 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4041 ; AVX2-SLOW-NEXT:    vzeroupper
   4042 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   4043 ; AVX2-SLOW-NEXT:    movzwl %ax, %ebx
   4044 ; AVX2-SLOW-NEXT:    orl %ebp, %ebx
   4045 ; AVX2-SLOW-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   4046 ; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4047 ; AVX2-SLOW-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   4048 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4049 ; AVX2-SLOW-NEXT:    vzeroupper
   4050 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   4051 ; AVX2-SLOW-NEXT:    movl %eax, %ebp
   4052 ; AVX2-SLOW-NEXT:    shll $16, %ebp
   4053 ; AVX2-SLOW-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   4054 ; AVX2-SLOW-NEXT:    callq __truncdfhf2
   4055 ; AVX2-SLOW-NEXT:    movzwl %ax, %eax
   4056 ; AVX2-SLOW-NEXT:    orl %ebp, %eax
   4057 ; AVX2-SLOW-NEXT:    shlq $32, %rax
   4058 ; AVX2-SLOW-NEXT:    orq %rbx, %rax
   4059 ; AVX2-SLOW-NEXT:    vmovq %rax, %xmm0
   4060 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   4061 ; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   4062 ; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%r14)
   4063 ; AVX2-SLOW-NEXT:    addq $32, %rsp
   4064 ; AVX2-SLOW-NEXT:    popq %rbx
   4065 ; AVX2-SLOW-NEXT:    popq %r14
   4066 ; AVX2-SLOW-NEXT:    popq %rbp
   4067 ; AVX2-SLOW-NEXT:    retq
   4068 ;
   4069 ; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero:
   4070 ; AVX2-FAST:       # %bb.0:
   4071 ; AVX2-FAST-NEXT:    pushq %rbp
   4072 ; AVX2-FAST-NEXT:    pushq %r14
   4073 ; AVX2-FAST-NEXT:    pushq %rbx
   4074 ; AVX2-FAST-NEXT:    subq $32, %rsp
   4075 ; AVX2-FAST-NEXT:    movq %rdi, %r14
   4076 ; AVX2-FAST-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   4077 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4078 ; AVX2-FAST-NEXT:    vzeroupper
   4079 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   4080 ; AVX2-FAST-NEXT:    movl %eax, %ebp
   4081 ; AVX2-FAST-NEXT:    shll $16, %ebp
   4082 ; AVX2-FAST-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   4083 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4084 ; AVX2-FAST-NEXT:    vzeroupper
   4085 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   4086 ; AVX2-FAST-NEXT:    movzwl %ax, %ebx
   4087 ; AVX2-FAST-NEXT:    orl %ebp, %ebx
   4088 ; AVX2-FAST-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   4089 ; AVX2-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4090 ; AVX2-FAST-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   4091 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4092 ; AVX2-FAST-NEXT:    vzeroupper
   4093 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   4094 ; AVX2-FAST-NEXT:    movl %eax, %ebp
   4095 ; AVX2-FAST-NEXT:    shll $16, %ebp
   4096 ; AVX2-FAST-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   4097 ; AVX2-FAST-NEXT:    callq __truncdfhf2
   4098 ; AVX2-FAST-NEXT:    movzwl %ax, %eax
   4099 ; AVX2-FAST-NEXT:    orl %ebp, %eax
   4100 ; AVX2-FAST-NEXT:    shlq $32, %rax
   4101 ; AVX2-FAST-NEXT:    orq %rbx, %rax
   4102 ; AVX2-FAST-NEXT:    vmovq %rax, %xmm0
   4103 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   4104 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, (%r14)
   4105 ; AVX2-FAST-NEXT:    addq $32, %rsp
   4106 ; AVX2-FAST-NEXT:    popq %rbx
   4107 ; AVX2-FAST-NEXT:    popq %r14
   4108 ; AVX2-FAST-NEXT:    popq %rbp
   4109 ; AVX2-FAST-NEXT:    retq
   4110 ;
   4111 ; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
   4112 ; AVX512F:       # %bb.0:
   4113 ; AVX512F-NEXT:    pushq %rbp
   4114 ; AVX512F-NEXT:    pushq %r14
   4115 ; AVX512F-NEXT:    pushq %rbx
   4116 ; AVX512F-NEXT:    subq $32, %rsp
   4117 ; AVX512F-NEXT:    movq %rdi, %r14
   4118 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   4119 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4120 ; AVX512F-NEXT:    vzeroupper
   4121 ; AVX512F-NEXT:    callq __truncdfhf2
   4122 ; AVX512F-NEXT:    movl %eax, %ebp
   4123 ; AVX512F-NEXT:    shll $16, %ebp
   4124 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   4125 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4126 ; AVX512F-NEXT:    vzeroupper
   4127 ; AVX512F-NEXT:    callq __truncdfhf2
   4128 ; AVX512F-NEXT:    movzwl %ax, %ebx
   4129 ; AVX512F-NEXT:    orl %ebp, %ebx
   4130 ; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   4131 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4132 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   4133 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4134 ; AVX512F-NEXT:    vzeroupper
   4135 ; AVX512F-NEXT:    callq __truncdfhf2
   4136 ; AVX512F-NEXT:    movl %eax, %ebp
   4137 ; AVX512F-NEXT:    shll $16, %ebp
   4138 ; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   4139 ; AVX512F-NEXT:    callq __truncdfhf2
   4140 ; AVX512F-NEXT:    movzwl %ax, %eax
   4141 ; AVX512F-NEXT:    orl %ebp, %eax
   4142 ; AVX512F-NEXT:    shlq $32, %rax
   4143 ; AVX512F-NEXT:    orq %rbx, %rax
   4144 ; AVX512F-NEXT:    vmovq %rax, %xmm0
   4145 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   4146 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
   4147 ; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
   4148 ; AVX512F-NEXT:    addq $32, %rsp
   4149 ; AVX512F-NEXT:    popq %rbx
   4150 ; AVX512F-NEXT:    popq %r14
   4151 ; AVX512F-NEXT:    popq %rbp
   4152 ; AVX512F-NEXT:    retq
   4153 ;
   4154 ; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
   4155 ; AVX512VL:       # %bb.0:
   4156 ; AVX512VL-NEXT:    pushq %rbp
   4157 ; AVX512VL-NEXT:    pushq %r14
   4158 ; AVX512VL-NEXT:    pushq %rbx
   4159 ; AVX512VL-NEXT:    subq $32, %rsp
   4160 ; AVX512VL-NEXT:    movq %rdi, %r14
   4161 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   4162 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4163 ; AVX512VL-NEXT:    vzeroupper
   4164 ; AVX512VL-NEXT:    callq __truncdfhf2
   4165 ; AVX512VL-NEXT:    movl %eax, %ebp
   4166 ; AVX512VL-NEXT:    shll $16, %ebp
   4167 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   4168 ; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4169 ; AVX512VL-NEXT:    vzeroupper
   4170 ; AVX512VL-NEXT:    callq __truncdfhf2
   4171 ; AVX512VL-NEXT:    movzwl %ax, %ebx
   4172 ; AVX512VL-NEXT:    orl %ebp, %ebx
   4173 ; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   4174 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4175 ; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   4176 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4177 ; AVX512VL-NEXT:    vzeroupper
   4178 ; AVX512VL-NEXT:    callq __truncdfhf2
   4179 ; AVX512VL-NEXT:    movl %eax, %ebp
   4180 ; AVX512VL-NEXT:    shll $16, %ebp
   4181 ; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   4182 ; AVX512VL-NEXT:    callq __truncdfhf2
   4183 ; AVX512VL-NEXT:    movzwl %ax, %eax
   4184 ; AVX512VL-NEXT:    orl %ebp, %eax
   4185 ; AVX512VL-NEXT:    shlq $32, %rax
   4186 ; AVX512VL-NEXT:    orq %rbx, %rax
   4187 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
   4188 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
   4189 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
   4190 ; AVX512VL-NEXT:    addq $32, %rsp
   4191 ; AVX512VL-NEXT:    popq %rbx
   4192 ; AVX512VL-NEXT:    popq %r14
   4193 ; AVX512VL-NEXT:    popq %rbp
   4194 ; AVX512VL-NEXT:    retq
   4195   %1 = fptrunc <4 x double> %a0 to <4 x half>
   4196   %2 = bitcast <4 x half> %1 to <4 x i16>
   4197   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4198   store <8 x i16> %3, <8 x i16>* %a1
   4199   ret void
   4200 }
   4201 
   4202 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
   4203 ; AVX1-LABEL: store_cvt_8f64_to_8i16:
   4204 ; AVX1:       # %bb.0:
   4205 ; AVX1-NEXT:    pushq %rbp
   4206 ; AVX1-NEXT:    pushq %r15
   4207 ; AVX1-NEXT:    pushq %r14
   4208 ; AVX1-NEXT:    pushq %r13
   4209 ; AVX1-NEXT:    pushq %r12
   4210 ; AVX1-NEXT:    pushq %rbx
   4211 ; AVX1-NEXT:    subq $136, %rsp
   4212 ; AVX1-NEXT:    movq %rdi, %rbx
   4213 ; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   4214 ; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   4215 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4216 ; AVX1-NEXT:    vzeroupper
   4217 ; AVX1-NEXT:    callq __truncdfhf2
   4218 ; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4219 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4220 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4221 ; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4222 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4223 ; AVX1-NEXT:    vzeroupper
   4224 ; AVX1-NEXT:    callq __truncdfhf2
   4225 ; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4226 ; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
   4227 ; AVX1-NEXT:    # xmm0 = mem[1,0]
   4228 ; AVX1-NEXT:    callq __truncdfhf2
   4229 ; AVX1-NEXT:    movl %eax, %r12d
   4230 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4231 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4232 ; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4233 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4234 ; AVX1-NEXT:    vzeroupper
   4235 ; AVX1-NEXT:    callq __truncdfhf2
   4236 ; AVX1-NEXT:    movl %eax, %r13d
   4237 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4238 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4239 ; AVX1-NEXT:    vzeroupper
   4240 ; AVX1-NEXT:    callq __truncdfhf2
   4241 ; AVX1-NEXT:    movl %eax, %ebp
   4242 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4243 ; AVX1-NEXT:    callq __truncdfhf2
   4244 ; AVX1-NEXT:    movl %eax, %r14d
   4245 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4246 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4247 ; AVX1-NEXT:    vzeroupper
   4248 ; AVX1-NEXT:    callq __truncdfhf2
   4249 ; AVX1-NEXT:    movl %eax, %r15d
   4250 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4251 ; AVX1-NEXT:    callq __truncdfhf2
   4252 ; AVX1-NEXT:    movw %ax, 12(%rbx)
   4253 ; AVX1-NEXT:    movw %r15w, 8(%rbx)
   4254 ; AVX1-NEXT:    movw %r14w, 4(%rbx)
   4255 ; AVX1-NEXT:    movw %bp, (%rbx)
   4256 ; AVX1-NEXT:    movw %r13w, 14(%rbx)
   4257 ; AVX1-NEXT:    movw %r12w, 10(%rbx)
   4258 ; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4259 ; AVX1-NEXT:    movw %ax, 6(%rbx)
   4260 ; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4261 ; AVX1-NEXT:    movw %ax, 2(%rbx)
   4262 ; AVX1-NEXT:    addq $136, %rsp
   4263 ; AVX1-NEXT:    popq %rbx
   4264 ; AVX1-NEXT:    popq %r12
   4265 ; AVX1-NEXT:    popq %r13
   4266 ; AVX1-NEXT:    popq %r14
   4267 ; AVX1-NEXT:    popq %r15
   4268 ; AVX1-NEXT:    popq %rbp
   4269 ; AVX1-NEXT:    retq
   4270 ;
   4271 ; AVX2-LABEL: store_cvt_8f64_to_8i16:
   4272 ; AVX2:       # %bb.0:
   4273 ; AVX2-NEXT:    pushq %rbp
   4274 ; AVX2-NEXT:    pushq %r15
   4275 ; AVX2-NEXT:    pushq %r14
   4276 ; AVX2-NEXT:    pushq %r13
   4277 ; AVX2-NEXT:    pushq %r12
   4278 ; AVX2-NEXT:    pushq %rbx
   4279 ; AVX2-NEXT:    subq $136, %rsp
   4280 ; AVX2-NEXT:    movq %rdi, %rbx
   4281 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   4282 ; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   4283 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4284 ; AVX2-NEXT:    vzeroupper
   4285 ; AVX2-NEXT:    callq __truncdfhf2
   4286 ; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4287 ; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4288 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4289 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4290 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4291 ; AVX2-NEXT:    vzeroupper
   4292 ; AVX2-NEXT:    callq __truncdfhf2
   4293 ; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4294 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
   4295 ; AVX2-NEXT:    # xmm0 = mem[1,0]
   4296 ; AVX2-NEXT:    callq __truncdfhf2
   4297 ; AVX2-NEXT:    movl %eax, %r12d
   4298 ; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4299 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4300 ; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4301 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4302 ; AVX2-NEXT:    vzeroupper
   4303 ; AVX2-NEXT:    callq __truncdfhf2
   4304 ; AVX2-NEXT:    movl %eax, %r13d
   4305 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4306 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4307 ; AVX2-NEXT:    vzeroupper
   4308 ; AVX2-NEXT:    callq __truncdfhf2
   4309 ; AVX2-NEXT:    movl %eax, %ebp
   4310 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4311 ; AVX2-NEXT:    callq __truncdfhf2
   4312 ; AVX2-NEXT:    movl %eax, %r14d
   4313 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4314 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4315 ; AVX2-NEXT:    vzeroupper
   4316 ; AVX2-NEXT:    callq __truncdfhf2
   4317 ; AVX2-NEXT:    movl %eax, %r15d
   4318 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4319 ; AVX2-NEXT:    callq __truncdfhf2
   4320 ; AVX2-NEXT:    movw %ax, 12(%rbx)
   4321 ; AVX2-NEXT:    movw %r15w, 8(%rbx)
   4322 ; AVX2-NEXT:    movw %r14w, 4(%rbx)
   4323 ; AVX2-NEXT:    movw %bp, (%rbx)
   4324 ; AVX2-NEXT:    movw %r13w, 14(%rbx)
   4325 ; AVX2-NEXT:    movw %r12w, 10(%rbx)
   4326 ; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4327 ; AVX2-NEXT:    movw %ax, 6(%rbx)
   4328 ; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4329 ; AVX2-NEXT:    movw %ax, 2(%rbx)
   4330 ; AVX2-NEXT:    addq $136, %rsp
   4331 ; AVX2-NEXT:    popq %rbx
   4332 ; AVX2-NEXT:    popq %r12
   4333 ; AVX2-NEXT:    popq %r13
   4334 ; AVX2-NEXT:    popq %r14
   4335 ; AVX2-NEXT:    popq %r15
   4336 ; AVX2-NEXT:    popq %rbp
   4337 ; AVX2-NEXT:    retq
   4338 ;
   4339 ; AVX512-LABEL: store_cvt_8f64_to_8i16:
   4340 ; AVX512:       # %bb.0:
   4341 ; AVX512-NEXT:    pushq %rbp
   4342 ; AVX512-NEXT:    pushq %r15
   4343 ; AVX512-NEXT:    pushq %r14
   4344 ; AVX512-NEXT:    pushq %r13
   4345 ; AVX512-NEXT:    pushq %r12
   4346 ; AVX512-NEXT:    pushq %rbx
   4347 ; AVX512-NEXT:    subq $200, %rsp
   4348 ; AVX512-NEXT:    movq %rdi, %rbx
   4349 ; AVX512-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
   4350 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4351 ; AVX512-NEXT:    vzeroupper
   4352 ; AVX512-NEXT:    callq __truncdfhf2
   4353 ; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4354 ; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
   4355 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4356 ; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4357 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4358 ; AVX512-NEXT:    vzeroupper
   4359 ; AVX512-NEXT:    callq __truncdfhf2
   4360 ; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
   4361 ; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
   4362 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
   4363 ; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   4364 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4365 ; AVX512-NEXT:    vzeroupper
   4366 ; AVX512-NEXT:    callq __truncdfhf2
   4367 ; AVX512-NEXT:    movl %eax, %r12d
   4368 ; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4369 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   4370 ; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
   4371 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   4372 ; AVX512-NEXT:    vzeroupper
   4373 ; AVX512-NEXT:    callq __truncdfhf2
   4374 ; AVX512-NEXT:    movl %eax, %r13d
   4375 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
   4376 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
   4377 ; AVX512-NEXT:    vzeroupper
   4378 ; AVX512-NEXT:    callq __truncdfhf2
   4379 ; AVX512-NEXT:    movl %eax, %ebp
   4380 ; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4381 ; AVX512-NEXT:    callq __truncdfhf2
   4382 ; AVX512-NEXT:    movl %eax, %r14d
   4383 ; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   4384 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4385 ; AVX512-NEXT:    vzeroupper
   4386 ; AVX512-NEXT:    callq __truncdfhf2
   4387 ; AVX512-NEXT:    movl %eax, %r15d
   4388 ; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
   4389 ; AVX512-NEXT:    callq __truncdfhf2
   4390 ; AVX512-NEXT:    movw %ax, 12(%rbx)
   4391 ; AVX512-NEXT:    movw %r15w, 8(%rbx)
   4392 ; AVX512-NEXT:    movw %r14w, 4(%rbx)
   4393 ; AVX512-NEXT:    movw %bp, (%rbx)
   4394 ; AVX512-NEXT:    movw %r13w, 14(%rbx)
   4395 ; AVX512-NEXT:    movw %r12w, 10(%rbx)
   4396 ; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4397 ; AVX512-NEXT:    movw %ax, 6(%rbx)
   4398 ; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
   4399 ; AVX512-NEXT:    movw %ax, 2(%rbx)
   4400 ; AVX512-NEXT:    addq $200, %rsp
   4401 ; AVX512-NEXT:    popq %rbx
   4402 ; AVX512-NEXT:    popq %r12
   4403 ; AVX512-NEXT:    popq %r13
   4404 ; AVX512-NEXT:    popq %r14
   4405 ; AVX512-NEXT:    popq %r15
   4406 ; AVX512-NEXT:    popq %rbp
   4407 ; AVX512-NEXT:    retq
   4408   %1 = fptrunc <8 x double> %a0 to <8 x half>
   4409   %2 = bitcast <8 x half> %1 to <8 x i16>
   4410   store <8 x i16> %2, <8 x i16>* %a1
   4411   ret void
   4412 }
   4413