Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
      5 
      6 ;
      7 ; Half to Float
      8 ;
      9 
     10 define float @cvt_i16_to_f32(i16 %a0) {
     11 ; ALL-LABEL: cvt_i16_to_f32:
     12 ; ALL:       # BB#0:
     13 ; ALL-NEXT:    movswl %di, %eax
     14 ; ALL-NEXT:    vmovd %eax, %xmm0
     15 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
     16 ; ALL-NEXT:    retq
     17   %1 = bitcast i16 %a0 to half
     18   %2 = fpext half %1 to float
     19   ret float %2
     20 }
     21 
     22 define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
     23 ; ALL-LABEL: cvt_4i16_to_4f32:
     24 ; ALL:       # BB#0:
     25 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
     26 ; ALL-NEXT:    vmovq %xmm0, %rax
     27 ; ALL-NEXT:    movq %rax, %rcx
     28 ; ALL-NEXT:    movq %rax, %rdx
     29 ; ALL-NEXT:    movswl %ax, %esi
     30 ; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
     31 ; ALL-NEXT:    shrl $16, %eax
     32 ; ALL-NEXT:    shrq $32, %rcx
     33 ; ALL-NEXT:    shrq $48, %rdx
     34 ; ALL-NEXT:    movswl %dx, %edx
     35 ; ALL-NEXT:    vmovd %edx, %xmm0
     36 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
     37 ; ALL-NEXT:    movswl %cx, %ecx
     38 ; ALL-NEXT:    vmovd %ecx, %xmm1
     39 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
     40 ; ALL-NEXT:    cwtl
     41 ; ALL-NEXT:    vmovd %eax, %xmm2
     42 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
     43 ; ALL-NEXT:    vmovd %esi, %xmm3
     44 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
     45 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
     46 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
     47 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
     48 ; ALL-NEXT:    retq
     49   %1 = bitcast <4 x i16> %a0 to <4 x half>
     50   %2 = fpext <4 x half> %1 to <4 x float>
     51   ret <4 x float> %2
     52 }
     53 
     54 define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
     55 ; ALL-LABEL: cvt_8i16_to_4f32:
     56 ; ALL:       # BB#0:
     57 ; ALL-NEXT:    vmovq %xmm0, %rax
     58 ; ALL-NEXT:    movq %rax, %rcx
     59 ; ALL-NEXT:    movq %rax, %rdx
     60 ; ALL-NEXT:    movswl %ax, %esi
     61 ; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
     62 ; ALL-NEXT:    shrl $16, %eax
     63 ; ALL-NEXT:    shrq $32, %rcx
     64 ; ALL-NEXT:    shrq $48, %rdx
     65 ; ALL-NEXT:    movswl %dx, %edx
     66 ; ALL-NEXT:    vmovd %edx, %xmm0
     67 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
     68 ; ALL-NEXT:    movswl %cx, %ecx
     69 ; ALL-NEXT:    vmovd %ecx, %xmm1
     70 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
     71 ; ALL-NEXT:    cwtl
     72 ; ALL-NEXT:    vmovd %eax, %xmm2
     73 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
     74 ; ALL-NEXT:    vmovd %esi, %xmm3
     75 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
     76 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
     77 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
     78 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
     79 ; ALL-NEXT:    retq
     80   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
     81   %2 = bitcast <4 x i16> %1 to <4 x half>
     82   %3 = fpext <4 x half> %2 to <4 x float>
     83   ret <4 x float> %3
     84 }
     85 
     86 define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
     87 ; AVX1-LABEL: cvt_8i16_to_8f32:
     88 ; AVX1:       # BB#0:
     89 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
     90 ; AVX1-NEXT:    movq %rdx, %r8
     91 ; AVX1-NEXT:    movq %rdx, %r10
     92 ; AVX1-NEXT:    movswl %dx, %r9d
     93 ; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
     94 ; AVX1-NEXT:    shrl $16, %edx
     95 ; AVX1-NEXT:    shrq $32, %r8
     96 ; AVX1-NEXT:    shrq $48, %r10
     97 ; AVX1-NEXT:    vmovq %xmm0, %rdi
     98 ; AVX1-NEXT:    movq %rdi, %rax
     99 ; AVX1-NEXT:    movq %rdi, %rsi
    100 ; AVX1-NEXT:    movswl %di, %ecx
    101 ; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
    102 ; AVX1-NEXT:    shrl $16, %edi
    103 ; AVX1-NEXT:    shrq $32, %rax
    104 ; AVX1-NEXT:    shrq $48, %rsi
    105 ; AVX1-NEXT:    movswl %si, %esi
    106 ; AVX1-NEXT:    vmovd %esi, %xmm0
    107 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    108 ; AVX1-NEXT:    cwtl
    109 ; AVX1-NEXT:    vmovd %eax, %xmm1
    110 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    111 ; AVX1-NEXT:    movswl %di, %eax
    112 ; AVX1-NEXT:    vmovd %eax, %xmm2
    113 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    114 ; AVX1-NEXT:    vmovd %ecx, %xmm3
    115 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    116 ; AVX1-NEXT:    movswl %r10w, %eax
    117 ; AVX1-NEXT:    vmovd %eax, %xmm4
    118 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    119 ; AVX1-NEXT:    movswl %r8w, %eax
    120 ; AVX1-NEXT:    vmovd %eax, %xmm5
    121 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    122 ; AVX1-NEXT:    movswl %dx, %eax
    123 ; AVX1-NEXT:    vmovd %eax, %xmm6
    124 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    125 ; AVX1-NEXT:    vmovd %r9d, %xmm7
    126 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    127 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
    128 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    129 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    130 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    131 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    132 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    133 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    134 ; AVX1-NEXT:    retq
    135 ;
    136 ; AVX2-LABEL: cvt_8i16_to_8f32:
    137 ; AVX2:       # BB#0:
    138 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
    139 ; AVX2-NEXT:    movq %rdx, %r8
    140 ; AVX2-NEXT:    movq %rdx, %r10
    141 ; AVX2-NEXT:    movswl %dx, %r9d
    142 ; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
    143 ; AVX2-NEXT:    shrl $16, %edx
    144 ; AVX2-NEXT:    shrq $32, %r8
    145 ; AVX2-NEXT:    shrq $48, %r10
    146 ; AVX2-NEXT:    vmovq %xmm0, %rdi
    147 ; AVX2-NEXT:    movq %rdi, %rax
    148 ; AVX2-NEXT:    movq %rdi, %rsi
    149 ; AVX2-NEXT:    movswl %di, %ecx
    150 ; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
    151 ; AVX2-NEXT:    shrl $16, %edi
    152 ; AVX2-NEXT:    shrq $32, %rax
    153 ; AVX2-NEXT:    shrq $48, %rsi
    154 ; AVX2-NEXT:    movswl %si, %esi
    155 ; AVX2-NEXT:    vmovd %esi, %xmm0
    156 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    157 ; AVX2-NEXT:    cwtl
    158 ; AVX2-NEXT:    vmovd %eax, %xmm1
    159 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    160 ; AVX2-NEXT:    movswl %di, %eax
    161 ; AVX2-NEXT:    vmovd %eax, %xmm2
    162 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    163 ; AVX2-NEXT:    vmovd %ecx, %xmm3
    164 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    165 ; AVX2-NEXT:    movswl %r10w, %eax
    166 ; AVX2-NEXT:    vmovd %eax, %xmm4
    167 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    168 ; AVX2-NEXT:    movswl %r8w, %eax
    169 ; AVX2-NEXT:    vmovd %eax, %xmm5
    170 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    171 ; AVX2-NEXT:    movswl %dx, %eax
    172 ; AVX2-NEXT:    vmovd %eax, %xmm6
    173 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    174 ; AVX2-NEXT:    vmovd %r9d, %xmm7
    175 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
    176 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
    177 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    178 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    179 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    180 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    181 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    182 ; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    183 ; AVX2-NEXT:    retq
    184 ;
    185 ; AVX512-LABEL: cvt_8i16_to_8f32:
    186 ; AVX512:       # BB#0:
    187 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
    188 ; AVX512-NEXT:    movq %rdx, %r8
    189 ; AVX512-NEXT:    movq %rdx, %r10
    190 ; AVX512-NEXT:    movswl %dx, %r9d
    191 ; AVX512-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
    192 ; AVX512-NEXT:    shrl $16, %edx
    193 ; AVX512-NEXT:    shrq $32, %r8
    194 ; AVX512-NEXT:    shrq $48, %r10
    195 ; AVX512-NEXT:    vmovq %xmm0, %rdi
    196 ; AVX512-NEXT:    movq %rdi, %rax
    197 ; AVX512-NEXT:    movq %rdi, %rsi
    198 ; AVX512-NEXT:    movswl %di, %ecx
    199 ; AVX512-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
    200 ; AVX512-NEXT:    shrl $16, %edi
    201 ; AVX512-NEXT:    shrq $32, %rax
    202 ; AVX512-NEXT:    shrq $48, %rsi
    203 ; AVX512-NEXT:    movswl %si, %esi
    204 ; AVX512-NEXT:    vmovd %esi, %xmm0
    205 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
    206 ; AVX512-NEXT:    cwtl
    207 ; AVX512-NEXT:    vmovd %eax, %xmm1
    208 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
    209 ; AVX512-NEXT:    movswl %di, %eax
    210 ; AVX512-NEXT:    vmovd %eax, %xmm2
    211 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
    212 ; AVX512-NEXT:    vmovd %ecx, %xmm3
    213 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
    214 ; AVX512-NEXT:    movswl %r10w, %eax
    215 ; AVX512-NEXT:    vmovd %eax, %xmm4
    216 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
    217 ; AVX512-NEXT:    movswl %r8w, %eax
    218 ; AVX512-NEXT:    vmovd %eax, %xmm5
    219 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
    220 ; AVX512-NEXT:    movswl %dx, %eax
    221 ; AVX512-NEXT:    vmovd %eax, %xmm6
    222 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
    223 ; AVX512-NEXT:    vmovd %r9d, %xmm7
    224 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
    225 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
    226 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    227 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    228 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    229 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    230 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    231 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
    232 ; AVX512-NEXT:    retq
    233   %1 = bitcast <8 x i16> %a0 to <8 x half>
    234   %2 = fpext <8 x half> %1 to <8 x float>
    235   ret <8 x float> %2
    236 }
    237 
    238 define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
    239 ; AVX1-LABEL: cvt_16i16_to_16f32:
    240 ; AVX1:       # BB#0:
    241 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    242 ; AVX1-NEXT:    vmovq %xmm4, %rax
    243 ; AVX1-NEXT:    movq %rax, %rcx
    244 ; AVX1-NEXT:    shrq $48, %rcx
    245 ; AVX1-NEXT:    movswl %cx, %ecx
    246 ; AVX1-NEXT:    vmovd %ecx, %xmm8
    247 ; AVX1-NEXT:    movq %rax, %rcx
    248 ; AVX1-NEXT:    shrq $32, %rcx
    249 ; AVX1-NEXT:    movswl %cx, %ecx
    250 ; AVX1-NEXT:    vmovd %ecx, %xmm9
    251 ; AVX1-NEXT:    movswl %ax, %ecx
    252 ; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    253 ; AVX1-NEXT:    shrl $16, %eax
    254 ; AVX1-NEXT:    cwtl
    255 ; AVX1-NEXT:    vmovd %eax, %xmm10
    256 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
    257 ; AVX1-NEXT:    vmovd %ecx, %xmm11
    258 ; AVX1-NEXT:    movq %rax, %rcx
    259 ; AVX1-NEXT:    shrq $48, %rcx
    260 ; AVX1-NEXT:    movswl %cx, %ecx
    261 ; AVX1-NEXT:    vmovd %ecx, %xmm12
    262 ; AVX1-NEXT:    movq %rax, %rcx
    263 ; AVX1-NEXT:    shrq $32, %rcx
    264 ; AVX1-NEXT:    movswl %cx, %ecx
    265 ; AVX1-NEXT:    vmovd %ecx, %xmm13
    266 ; AVX1-NEXT:    movswl %ax, %ecx
    267 ; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    268 ; AVX1-NEXT:    shrl $16, %eax
    269 ; AVX1-NEXT:    cwtl
    270 ; AVX1-NEXT:    vmovd %eax, %xmm14
    271 ; AVX1-NEXT:    vmovq %xmm0, %rax
    272 ; AVX1-NEXT:    vmovd %ecx, %xmm15
    273 ; AVX1-NEXT:    movq %rax, %rcx
    274 ; AVX1-NEXT:    shrq $48, %rcx
    275 ; AVX1-NEXT:    movswl %cx, %ecx
    276 ; AVX1-NEXT:    vmovd %ecx, %xmm2
    277 ; AVX1-NEXT:    movq %rax, %rcx
    278 ; AVX1-NEXT:    shrq $32, %rcx
    279 ; AVX1-NEXT:    movswl %cx, %ecx
    280 ; AVX1-NEXT:    vmovd %ecx, %xmm3
    281 ; AVX1-NEXT:    movswl %ax, %ecx
    282 ; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    283 ; AVX1-NEXT:    shrl $16, %eax
    284 ; AVX1-NEXT:    cwtl
    285 ; AVX1-NEXT:    vmovd %eax, %xmm4
    286 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
    287 ; AVX1-NEXT:    vmovd %ecx, %xmm0
    288 ; AVX1-NEXT:    movq %rax, %rcx
    289 ; AVX1-NEXT:    shrq $48, %rcx
    290 ; AVX1-NEXT:    movswl %cx, %ecx
    291 ; AVX1-NEXT:    vmovd %ecx, %xmm5
    292 ; AVX1-NEXT:    movq %rax, %rcx
    293 ; AVX1-NEXT:    shrq $32, %rcx
    294 ; AVX1-NEXT:    movswl %cx, %ecx
    295 ; AVX1-NEXT:    vmovd %ecx, %xmm6
    296 ; AVX1-NEXT:    movl %eax, %ecx
    297 ; AVX1-NEXT:    shrl $16, %ecx
    298 ; AVX1-NEXT:    movswl %cx, %ecx
    299 ; AVX1-NEXT:    vmovd %ecx, %xmm7
    300 ; AVX1-NEXT:    cwtl
    301 ; AVX1-NEXT:    vmovd %eax, %xmm1
    302 ; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
    303 ; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
    304 ; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
    305 ; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
    306 ; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
    307 ; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
    308 ; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
    309 ; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
    310 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    311 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    312 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    313 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    314 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    315 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    316 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    317 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    318 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
    319 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    320 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    321 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
    322 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
    323 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
    324 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    325 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
    326 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    327 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    328 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
    329 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    330 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    331 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    332 ; AVX1-NEXT:    retq
    333 ;
    334 ; AVX2-LABEL: cvt_16i16_to_16f32:
    335 ; AVX2:       # BB#0:
    336 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
    337 ; AVX2-NEXT:    vmovq %xmm4, %rax
    338 ; AVX2-NEXT:    movq %rax, %rcx
    339 ; AVX2-NEXT:    shrq $48, %rcx
    340 ; AVX2-NEXT:    movswl %cx, %ecx
    341 ; AVX2-NEXT:    vmovd %ecx, %xmm8
    342 ; AVX2-NEXT:    movq %rax, %rcx
    343 ; AVX2-NEXT:    shrq $32, %rcx
    344 ; AVX2-NEXT:    movswl %cx, %ecx
    345 ; AVX2-NEXT:    vmovd %ecx, %xmm9
    346 ; AVX2-NEXT:    movswl %ax, %ecx
    347 ; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    348 ; AVX2-NEXT:    shrl $16, %eax
    349 ; AVX2-NEXT:    cwtl
    350 ; AVX2-NEXT:    vmovd %eax, %xmm10
    351 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
    352 ; AVX2-NEXT:    vmovd %ecx, %xmm11
    353 ; AVX2-NEXT:    movq %rax, %rcx
    354 ; AVX2-NEXT:    shrq $48, %rcx
    355 ; AVX2-NEXT:    movswl %cx, %ecx
    356 ; AVX2-NEXT:    vmovd %ecx, %xmm12
    357 ; AVX2-NEXT:    movq %rax, %rcx
    358 ; AVX2-NEXT:    shrq $32, %rcx
    359 ; AVX2-NEXT:    movswl %cx, %ecx
    360 ; AVX2-NEXT:    vmovd %ecx, %xmm13
    361 ; AVX2-NEXT:    movswl %ax, %ecx
    362 ; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    363 ; AVX2-NEXT:    shrl $16, %eax
    364 ; AVX2-NEXT:    cwtl
    365 ; AVX2-NEXT:    vmovd %eax, %xmm14
    366 ; AVX2-NEXT:    vmovq %xmm0, %rax
    367 ; AVX2-NEXT:    vmovd %ecx, %xmm15
    368 ; AVX2-NEXT:    movq %rax, %rcx
    369 ; AVX2-NEXT:    shrq $48, %rcx
    370 ; AVX2-NEXT:    movswl %cx, %ecx
    371 ; AVX2-NEXT:    vmovd %ecx, %xmm2
    372 ; AVX2-NEXT:    movq %rax, %rcx
    373 ; AVX2-NEXT:    shrq $32, %rcx
    374 ; AVX2-NEXT:    movswl %cx, %ecx
    375 ; AVX2-NEXT:    vmovd %ecx, %xmm3
    376 ; AVX2-NEXT:    movswl %ax, %ecx
    377 ; AVX2-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    378 ; AVX2-NEXT:    shrl $16, %eax
    379 ; AVX2-NEXT:    cwtl
    380 ; AVX2-NEXT:    vmovd %eax, %xmm4
    381 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
    382 ; AVX2-NEXT:    vmovd %ecx, %xmm0
    383 ; AVX2-NEXT:    movq %rax, %rcx
    384 ; AVX2-NEXT:    shrq $48, %rcx
    385 ; AVX2-NEXT:    movswl %cx, %ecx
    386 ; AVX2-NEXT:    vmovd %ecx, %xmm5
    387 ; AVX2-NEXT:    movq %rax, %rcx
    388 ; AVX2-NEXT:    shrq $32, %rcx
    389 ; AVX2-NEXT:    movswl %cx, %ecx
    390 ; AVX2-NEXT:    vmovd %ecx, %xmm6
    391 ; AVX2-NEXT:    movl %eax, %ecx
    392 ; AVX2-NEXT:    shrl $16, %ecx
    393 ; AVX2-NEXT:    movswl %cx, %ecx
    394 ; AVX2-NEXT:    vmovd %ecx, %xmm7
    395 ; AVX2-NEXT:    cwtl
    396 ; AVX2-NEXT:    vmovd %eax, %xmm1
    397 ; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
    398 ; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
    399 ; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
    400 ; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
    401 ; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
    402 ; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
    403 ; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
    404 ; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
    405 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    406 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    407 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    408 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    409 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    410 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    411 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
    412 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    413 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
    414 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    415 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    416 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
    417 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
    418 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
    419 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    420 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
    421 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    422 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    423 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
    424 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    425 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    426 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    427 ; AVX2-NEXT:    retq
    428 ;
    429 ; AVX512-LABEL: cvt_16i16_to_16f32:
    430 ; AVX512:       # BB#0:
    431 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm10
    432 ; AVX512-NEXT:    vmovq %xmm0, %rax
    433 ; AVX512-NEXT:    movq %rax, %rcx
    434 ; AVX512-NEXT:    shrq $48, %rcx
    435 ; AVX512-NEXT:    movswl %cx, %ecx
    436 ; AVX512-NEXT:    vmovd %ecx, %xmm8
    437 ; AVX512-NEXT:    movq %rax, %rcx
    438 ; AVX512-NEXT:    shrq $32, %rcx
    439 ; AVX512-NEXT:    movswl %cx, %ecx
    440 ; AVX512-NEXT:    vmovd %ecx, %xmm9
    441 ; AVX512-NEXT:    movswl %ax, %ecx
    442 ; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    443 ; AVX512-NEXT:    shrl $16, %eax
    444 ; AVX512-NEXT:    cwtl
    445 ; AVX512-NEXT:    vmovd %eax, %xmm11
    446 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
    447 ; AVX512-NEXT:    vmovd %ecx, %xmm12
    448 ; AVX512-NEXT:    movq %rax, %rcx
    449 ; AVX512-NEXT:    shrq $48, %rcx
    450 ; AVX512-NEXT:    movswl %cx, %ecx
    451 ; AVX512-NEXT:    vmovd %ecx, %xmm13
    452 ; AVX512-NEXT:    movq %rax, %rcx
    453 ; AVX512-NEXT:    shrq $32, %rcx
    454 ; AVX512-NEXT:    movswl %cx, %ecx
    455 ; AVX512-NEXT:    vmovd %ecx, %xmm14
    456 ; AVX512-NEXT:    movswl %ax, %ecx
    457 ; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    458 ; AVX512-NEXT:    shrl $16, %eax
    459 ; AVX512-NEXT:    cwtl
    460 ; AVX512-NEXT:    vmovd %eax, %xmm15
    461 ; AVX512-NEXT:    vmovq %xmm10, %rax
    462 ; AVX512-NEXT:    vmovd %ecx, %xmm2
    463 ; AVX512-NEXT:    movq %rax, %rcx
    464 ; AVX512-NEXT:    shrq $48, %rcx
    465 ; AVX512-NEXT:    movswl %cx, %ecx
    466 ; AVX512-NEXT:    vmovd %ecx, %xmm3
    467 ; AVX512-NEXT:    movq %rax, %rcx
    468 ; AVX512-NEXT:    shrq $32, %rcx
    469 ; AVX512-NEXT:    movswl %cx, %ecx
    470 ; AVX512-NEXT:    vmovd %ecx, %xmm1
    471 ; AVX512-NEXT:    movswl %ax, %ecx
    472 ; AVX512-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    473 ; AVX512-NEXT:    shrl $16, %eax
    474 ; AVX512-NEXT:    cwtl
    475 ; AVX512-NEXT:    vmovd %eax, %xmm4
    476 ; AVX512-NEXT:    vpextrq $1, %xmm10, %rax
    477 ; AVX512-NEXT:    vmovd %ecx, %xmm10
    478 ; AVX512-NEXT:    movq %rax, %rcx
    479 ; AVX512-NEXT:    shrq $48, %rcx
    480 ; AVX512-NEXT:    movswl %cx, %ecx
    481 ; AVX512-NEXT:    vmovd %ecx, %xmm5
    482 ; AVX512-NEXT:    movq %rax, %rcx
    483 ; AVX512-NEXT:    shrq $32, %rcx
    484 ; AVX512-NEXT:    movswl %cx, %ecx
    485 ; AVX512-NEXT:    vmovd %ecx, %xmm6
    486 ; AVX512-NEXT:    movl %eax, %ecx
    487 ; AVX512-NEXT:    shrl $16, %ecx
    488 ; AVX512-NEXT:    movswl %cx, %ecx
    489 ; AVX512-NEXT:    vmovd %ecx, %xmm7
    490 ; AVX512-NEXT:    cwtl
    491 ; AVX512-NEXT:    vmovd %eax, %xmm0
    492 ; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm8
    493 ; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm9
    494 ; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm11
    495 ; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm12
    496 ; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm13
    497 ; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm14
    498 ; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm15
    499 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
    500 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
    501 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
    502 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
    503 ; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm10
    504 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
    505 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
    506 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
    507 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
    508 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
    509 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
    510 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
    511 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
    512 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
    513 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
    514 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
    515 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
    516 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
    517 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
    518 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
    519 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    520 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    521 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
    522 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
    523 ; AVX512-NEXT:    retq
    524   %1 = bitcast <16 x i16> %a0 to <16 x half>
    525   %2 = fpext <16 x half> %1 to <16 x float>
    526   ret <16 x float> %2
    527 }
    528 
    529 ;
    530 ; Half to Float (Load)
    531 ;
    532 
    533 define float @load_cvt_i16_to_f32(i16* %a0) {
    534 ; ALL-LABEL: load_cvt_i16_to_f32:
    535 ; ALL:       # BB#0:
    536 ; ALL-NEXT:    movswl (%rdi), %eax
    537 ; ALL-NEXT:    vmovd %eax, %xmm0
    538 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    539 ; ALL-NEXT:    retq
    540   %1 = load i16, i16* %a0
    541   %2 = bitcast i16 %1 to half
    542   %3 = fpext half %2 to float
    543   ret float %3
    544 }
    545 
    546 define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) {
    547 ; ALL-LABEL: load_cvt_4i16_to_4f32:
    548 ; ALL:       # BB#0:
    549 ; ALL-NEXT:    movswl 6(%rdi), %eax
    550 ; ALL-NEXT:    vmovd %eax, %xmm0
    551 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    552 ; ALL-NEXT:    movswl 4(%rdi), %eax
    553 ; ALL-NEXT:    vmovd %eax, %xmm1
    554 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    555 ; ALL-NEXT:    movswl (%rdi), %eax
    556 ; ALL-NEXT:    vmovd %eax, %xmm2
    557 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    558 ; ALL-NEXT:    movswl 2(%rdi), %eax
    559 ; ALL-NEXT:    vmovd %eax, %xmm3
    560 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    561 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    562 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    563 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    564 ; ALL-NEXT:    retq
    565   %1 = load <4 x i16>, <4 x i16>* %a0
    566   %2 = bitcast <4 x i16> %1 to <4 x half>
    567   %3 = fpext <4 x half> %2 to <4 x float>
    568   ret <4 x float> %3
    569 }
    570 
    571 define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) {
    572 ; ALL-LABEL: load_cvt_8i16_to_4f32:
    573 ; ALL:       # BB#0:
    574 ; ALL-NEXT:    movq (%rdi), %rax
    575 ; ALL-NEXT:    movq %rax, %rcx
    576 ; ALL-NEXT:    movq %rax, %rdx
    577 ; ALL-NEXT:    movswl %ax, %esi
    578 ; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
    579 ; ALL-NEXT:    shrl $16, %eax
    580 ; ALL-NEXT:    shrq $32, %rcx
    581 ; ALL-NEXT:    shrq $48, %rdx
    582 ; ALL-NEXT:    movswl %dx, %edx
    583 ; ALL-NEXT:    vmovd %edx, %xmm0
    584 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    585 ; ALL-NEXT:    movswl %cx, %ecx
    586 ; ALL-NEXT:    vmovd %ecx, %xmm1
    587 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    588 ; ALL-NEXT:    cwtl
    589 ; ALL-NEXT:    vmovd %eax, %xmm2
    590 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    591 ; ALL-NEXT:    vmovd %esi, %xmm3
    592 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    593 ; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    594 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    595 ; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    596 ; ALL-NEXT:    retq
    597   %1 = load <8 x i16>, <8 x i16>* %a0
    598   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    599   %3 = bitcast <4 x i16> %2 to <4 x half>
    600   %4 = fpext <4 x half> %3 to <4 x float>
    601   ret <4 x float> %4
    602 }
    603 
    604 define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) {
    605 ; AVX1-LABEL: load_cvt_8i16_to_8f32:
    606 ; AVX1:       # BB#0:
    607 ; AVX1-NEXT:    movswl 6(%rdi), %eax
    608 ; AVX1-NEXT:    vmovd %eax, %xmm0
    609 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    610 ; AVX1-NEXT:    movswl 4(%rdi), %eax
    611 ; AVX1-NEXT:    vmovd %eax, %xmm1
    612 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    613 ; AVX1-NEXT:    movswl (%rdi), %eax
    614 ; AVX1-NEXT:    vmovd %eax, %xmm2
    615 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    616 ; AVX1-NEXT:    movswl 2(%rdi), %eax
    617 ; AVX1-NEXT:    vmovd %eax, %xmm3
    618 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    619 ; AVX1-NEXT:    movswl 14(%rdi), %eax
    620 ; AVX1-NEXT:    vmovd %eax, %xmm4
    621 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    622 ; AVX1-NEXT:    movswl 12(%rdi), %eax
    623 ; AVX1-NEXT:    vmovd %eax, %xmm5
    624 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    625 ; AVX1-NEXT:    movswl 8(%rdi), %eax
    626 ; AVX1-NEXT:    vmovd %eax, %xmm6
    627 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    628 ; AVX1-NEXT:    movswl 10(%rdi), %eax
    629 ; AVX1-NEXT:    vmovd %eax, %xmm7
    630 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    631 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
    632 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    633 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    634 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    635 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    636 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    637 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    638 ; AVX1-NEXT:    retq
    639 ;
    640 ; AVX2-LABEL: load_cvt_8i16_to_8f32:
    641 ; AVX2:       # BB#0:
    642 ; AVX2-NEXT:    movswl 6(%rdi), %eax
    643 ; AVX2-NEXT:    vmovd %eax, %xmm0
    644 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    645 ; AVX2-NEXT:    movswl 4(%rdi), %eax
    646 ; AVX2-NEXT:    vmovd %eax, %xmm1
    647 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    648 ; AVX2-NEXT:    movswl (%rdi), %eax
    649 ; AVX2-NEXT:    vmovd %eax, %xmm2
    650 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    651 ; AVX2-NEXT:    movswl 2(%rdi), %eax
    652 ; AVX2-NEXT:    vmovd %eax, %xmm3
    653 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    654 ; AVX2-NEXT:    movswl 14(%rdi), %eax
    655 ; AVX2-NEXT:    vmovd %eax, %xmm4
    656 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    657 ; AVX2-NEXT:    movswl 12(%rdi), %eax
    658 ; AVX2-NEXT:    vmovd %eax, %xmm5
    659 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    660 ; AVX2-NEXT:    movswl 8(%rdi), %eax
    661 ; AVX2-NEXT:    vmovd %eax, %xmm6
    662 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    663 ; AVX2-NEXT:    movswl 10(%rdi), %eax
    664 ; AVX2-NEXT:    vmovd %eax, %xmm7
    665 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
    666 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
    667 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    668 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    669 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    670 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    671 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    672 ; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    673 ; AVX2-NEXT:    retq
    674 ;
    675 ; AVX512-LABEL: load_cvt_8i16_to_8f32:
    676 ; AVX512:       # BB#0:
    677 ; AVX512-NEXT:    movswl 6(%rdi), %eax
    678 ; AVX512-NEXT:    vmovd %eax, %xmm0
    679 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
    680 ; AVX512-NEXT:    movswl 4(%rdi), %eax
    681 ; AVX512-NEXT:    vmovd %eax, %xmm1
    682 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
    683 ; AVX512-NEXT:    movswl (%rdi), %eax
    684 ; AVX512-NEXT:    vmovd %eax, %xmm2
    685 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
    686 ; AVX512-NEXT:    movswl 2(%rdi), %eax
    687 ; AVX512-NEXT:    vmovd %eax, %xmm3
    688 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
    689 ; AVX512-NEXT:    movswl 14(%rdi), %eax
    690 ; AVX512-NEXT:    vmovd %eax, %xmm4
    691 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
    692 ; AVX512-NEXT:    movswl 12(%rdi), %eax
    693 ; AVX512-NEXT:    vmovd %eax, %xmm5
    694 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
    695 ; AVX512-NEXT:    movswl 8(%rdi), %eax
    696 ; AVX512-NEXT:    vmovd %eax, %xmm6
    697 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
    698 ; AVX512-NEXT:    movswl 10(%rdi), %eax
    699 ; AVX512-NEXT:    vmovd %eax, %xmm7
    700 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
    701 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
    702 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    703 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    704 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    705 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    706 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    707 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
    708 ; AVX512-NEXT:    retq
    709   %1 = load <8 x i16>, <8 x i16>* %a0
    710   %2 = bitcast <8 x i16> %1 to <8 x half>
    711   %3 = fpext <8 x half> %2 to <8 x float>
    712   ret <8 x float> %3
    713 }
    714 
    715 define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) {
    716 ; AVX1-LABEL: load_cvt_16i16_to_16f32:
    717 ; AVX1:       # BB#0:
    718 ; AVX1-NEXT:    movswl 22(%rdi), %eax
    719 ; AVX1-NEXT:    vmovd %eax, %xmm0
    720 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm8
    721 ; AVX1-NEXT:    movswl 20(%rdi), %eax
    722 ; AVX1-NEXT:    vmovd %eax, %xmm0
    723 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm9
    724 ; AVX1-NEXT:    movswl 16(%rdi), %eax
    725 ; AVX1-NEXT:    vmovd %eax, %xmm0
    726 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm10
    727 ; AVX1-NEXT:    movswl 18(%rdi), %eax
    728 ; AVX1-NEXT:    vmovd %eax, %xmm0
    729 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm11
    730 ; AVX1-NEXT:    movswl 30(%rdi), %eax
    731 ; AVX1-NEXT:    vmovd %eax, %xmm0
    732 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm12
    733 ; AVX1-NEXT:    movswl 28(%rdi), %eax
    734 ; AVX1-NEXT:    vmovd %eax, %xmm0
    735 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm13
    736 ; AVX1-NEXT:    movswl 24(%rdi), %eax
    737 ; AVX1-NEXT:    vmovd %eax, %xmm0
    738 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm14
    739 ; AVX1-NEXT:    movswl 26(%rdi), %eax
    740 ; AVX1-NEXT:    vmovd %eax, %xmm0
    741 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm15
    742 ; AVX1-NEXT:    movswl 6(%rdi), %eax
    743 ; AVX1-NEXT:    vmovd %eax, %xmm0
    744 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
    745 ; AVX1-NEXT:    movswl 4(%rdi), %eax
    746 ; AVX1-NEXT:    vmovd %eax, %xmm2
    747 ; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
    748 ; AVX1-NEXT:    movswl (%rdi), %eax
    749 ; AVX1-NEXT:    vmovd %eax, %xmm3
    750 ; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
    751 ; AVX1-NEXT:    movswl 2(%rdi), %eax
    752 ; AVX1-NEXT:    vmovd %eax, %xmm4
    753 ; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
    754 ; AVX1-NEXT:    movswl 14(%rdi), %eax
    755 ; AVX1-NEXT:    vmovd %eax, %xmm5
    756 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
    757 ; AVX1-NEXT:    movswl 12(%rdi), %eax
    758 ; AVX1-NEXT:    vmovd %eax, %xmm6
    759 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
    760 ; AVX1-NEXT:    movswl 8(%rdi), %eax
    761 ; AVX1-NEXT:    vmovd %eax, %xmm7
    762 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
    763 ; AVX1-NEXT:    movswl 10(%rdi), %eax
    764 ; AVX1-NEXT:    vmovd %eax, %xmm1
    765 ; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
    766 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
    767 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    768 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    769 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
    770 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
    771 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
    772 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    773 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
    774 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    775 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    776 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
    777 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    778 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    779 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    780 ; AVX1-NEXT:    retq
    781 ;
    782 ; AVX2-LABEL: load_cvt_16i16_to_16f32:
    783 ; AVX2:       # BB#0:
    784 ; AVX2-NEXT:    movswl 22(%rdi), %eax
    785 ; AVX2-NEXT:    vmovd %eax, %xmm0
    786 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm8
    787 ; AVX2-NEXT:    movswl 20(%rdi), %eax
    788 ; AVX2-NEXT:    vmovd %eax, %xmm0
    789 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm9
    790 ; AVX2-NEXT:    movswl 16(%rdi), %eax
    791 ; AVX2-NEXT:    vmovd %eax, %xmm0
    792 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm10
    793 ; AVX2-NEXT:    movswl 18(%rdi), %eax
    794 ; AVX2-NEXT:    vmovd %eax, %xmm0
    795 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm11
    796 ; AVX2-NEXT:    movswl 30(%rdi), %eax
    797 ; AVX2-NEXT:    vmovd %eax, %xmm0
    798 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm12
    799 ; AVX2-NEXT:    movswl 28(%rdi), %eax
    800 ; AVX2-NEXT:    vmovd %eax, %xmm0
    801 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm13
    802 ; AVX2-NEXT:    movswl 24(%rdi), %eax
    803 ; AVX2-NEXT:    vmovd %eax, %xmm0
    804 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm14
    805 ; AVX2-NEXT:    movswl 26(%rdi), %eax
    806 ; AVX2-NEXT:    vmovd %eax, %xmm0
    807 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm15
    808 ; AVX2-NEXT:    movswl 6(%rdi), %eax
    809 ; AVX2-NEXT:    vmovd %eax, %xmm0
    810 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
    811 ; AVX2-NEXT:    movswl 4(%rdi), %eax
    812 ; AVX2-NEXT:    vmovd %eax, %xmm2
    813 ; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
    814 ; AVX2-NEXT:    movswl (%rdi), %eax
    815 ; AVX2-NEXT:    vmovd %eax, %xmm3
    816 ; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
    817 ; AVX2-NEXT:    movswl 2(%rdi), %eax
    818 ; AVX2-NEXT:    vmovd %eax, %xmm4
    819 ; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
    820 ; AVX2-NEXT:    movswl 14(%rdi), %eax
    821 ; AVX2-NEXT:    vmovd %eax, %xmm5
    822 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
    823 ; AVX2-NEXT:    movswl 12(%rdi), %eax
    824 ; AVX2-NEXT:    vmovd %eax, %xmm6
    825 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
    826 ; AVX2-NEXT:    movswl 8(%rdi), %eax
    827 ; AVX2-NEXT:    vmovd %eax, %xmm7
    828 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
    829 ; AVX2-NEXT:    movswl 10(%rdi), %eax
    830 ; AVX2-NEXT:    vmovd %eax, %xmm1
    831 ; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
    832 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
    833 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
    834 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
    835 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
    836 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
    837 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
    838 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    839 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
    840 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    841 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    842 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
    843 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    844 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    845 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    846 ; AVX2-NEXT:    retq
    847 ;
    848 ; AVX512-LABEL: load_cvt_16i16_to_16f32:
    849 ; AVX512:       # BB#0:
    850 ; AVX512-NEXT:    movswl 6(%rdi), %eax
    851 ; AVX512-NEXT:    vmovd %eax, %xmm0
    852 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm8
    853 ; AVX512-NEXT:    movswl 4(%rdi), %eax
    854 ; AVX512-NEXT:    vmovd %eax, %xmm0
    855 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm9
    856 ; AVX512-NEXT:    movswl (%rdi), %eax
    857 ; AVX512-NEXT:    vmovd %eax, %xmm0
    858 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm10
    859 ; AVX512-NEXT:    movswl 2(%rdi), %eax
    860 ; AVX512-NEXT:    vmovd %eax, %xmm0
    861 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm11
    862 ; AVX512-NEXT:    movswl 14(%rdi), %eax
    863 ; AVX512-NEXT:    vmovd %eax, %xmm0
    864 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm12
    865 ; AVX512-NEXT:    movswl 12(%rdi), %eax
    866 ; AVX512-NEXT:    vmovd %eax, %xmm0
    867 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm13
    868 ; AVX512-NEXT:    movswl 8(%rdi), %eax
    869 ; AVX512-NEXT:    vmovd %eax, %xmm0
    870 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm14
    871 ; AVX512-NEXT:    movswl 10(%rdi), %eax
    872 ; AVX512-NEXT:    vmovd %eax, %xmm0
    873 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm15
    874 ; AVX512-NEXT:    movswl 22(%rdi), %eax
    875 ; AVX512-NEXT:    vmovd %eax, %xmm0
    876 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
    877 ; AVX512-NEXT:    movswl 20(%rdi), %eax
    878 ; AVX512-NEXT:    vmovd %eax, %xmm1
    879 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
    880 ; AVX512-NEXT:    movswl 16(%rdi), %eax
    881 ; AVX512-NEXT:    vmovd %eax, %xmm2
    882 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
    883 ; AVX512-NEXT:    movswl 18(%rdi), %eax
    884 ; AVX512-NEXT:    vmovd %eax, %xmm3
    885 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
    886 ; AVX512-NEXT:    movswl 30(%rdi), %eax
    887 ; AVX512-NEXT:    vmovd %eax, %xmm4
    888 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
    889 ; AVX512-NEXT:    movswl 28(%rdi), %eax
    890 ; AVX512-NEXT:    vmovd %eax, %xmm5
    891 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
    892 ; AVX512-NEXT:    movswl 24(%rdi), %eax
    893 ; AVX512-NEXT:    vmovd %eax, %xmm6
    894 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
    895 ; AVX512-NEXT:    movswl 26(%rdi), %eax
    896 ; AVX512-NEXT:    vmovd %eax, %xmm7
    897 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
    898 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
    899 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
    900 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
    901 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
    902 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    903 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
    904 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
    905 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
    906 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
    907 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
    908 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
    909 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
    910 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
    911 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
    912 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
    913 ; AVX512-NEXT:    retq
    914   %1 = load <16 x i16>, <16 x i16>* %a0
    915   %2 = bitcast <16 x i16> %1 to <16 x half>
    916   %3 = fpext <16 x half> %2 to <16 x float>
    917   ret <16 x float> %3
    918 }
    919 
    920 ;
    921 ; Half to Double
    922 ;
    923 
    924 define double @cvt_i16_to_f64(i16 %a0) {
    925 ; ALL-LABEL: cvt_i16_to_f64:
    926 ; ALL:       # BB#0:
    927 ; ALL-NEXT:    movswl %di, %eax
    928 ; ALL-NEXT:    vmovd %eax, %xmm0
    929 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    930 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
    931 ; ALL-NEXT:    retq
    932   %1 = bitcast i16 %a0 to half
    933   %2 = fpext half %1 to double
    934   ret double %2
    935 }
    936 
    937 define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) {
    938 ; ALL-LABEL: cvt_2i16_to_2f64:
    939 ; ALL:       # BB#0:
    940 ; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    941 ; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
    942 ; ALL-NEXT:    vmovd %xmm0, %eax
    943 ; ALL-NEXT:    movswl %ax, %ecx
    944 ; ALL-NEXT:    shrl $16, %eax
    945 ; ALL-NEXT:    cwtl
    946 ; ALL-NEXT:    vmovd %eax, %xmm0
    947 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    948 ; ALL-NEXT:    vmovd %ecx, %xmm1
    949 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    950 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
    951 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
    952 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
    953 ; ALL-NEXT:    retq
    954   %1 = bitcast <2 x i16> %a0 to <2 x half>
    955   %2 = fpext <2 x half> %1 to <2 x double>
    956   ret <2 x double> %2
    957 }
    958 
    959 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) {
    960 ; ALL-LABEL: cvt_4i16_to_4f64:
    961 ; ALL:       # BB#0:
    962 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    963 ; ALL-NEXT:    vmovq %xmm0, %rax
    964 ; ALL-NEXT:    movq %rax, %rcx
    965 ; ALL-NEXT:    movl %eax, %edx
    966 ; ALL-NEXT:    movswl %ax, %esi
    967 ; ALL-NEXT:    shrq $48, %rax
    968 ; ALL-NEXT:    shrq $32, %rcx
    969 ; ALL-NEXT:    shrl $16, %edx
    970 ; ALL-NEXT:    movswl %dx, %edx
    971 ; ALL-NEXT:    vmovd %edx, %xmm0
    972 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
    973 ; ALL-NEXT:    vmovd %esi, %xmm1
    974 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
    975 ; ALL-NEXT:    movswl %cx, %ecx
    976 ; ALL-NEXT:    vmovd %ecx, %xmm2
    977 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
    978 ; ALL-NEXT:    cwtl
    979 ; ALL-NEXT:    vmovd %eax, %xmm3
    980 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
    981 ; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
    982 ; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
    983 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
    984 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
    985 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
    986 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
    987 ; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    988 ; ALL-NEXT:    retq
    989   %1 = bitcast <4 x i16> %a0 to <4 x half>
    990   %2 = fpext <4 x half> %1 to <4 x double>
    991   ret <4 x double> %2
    992 }
    993 
    994 define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) {
    995 ; ALL-LABEL: cvt_8i16_to_2f64:
    996 ; ALL:       # BB#0:
    997 ; ALL-NEXT:    vmovd %xmm0, %eax
    998 ; ALL-NEXT:    movswl %ax, %ecx
    999 ; ALL-NEXT:    shrl $16, %eax
   1000 ; ALL-NEXT:    cwtl
   1001 ; ALL-NEXT:    vmovd %eax, %xmm0
   1002 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1003 ; ALL-NEXT:    vmovd %ecx, %xmm1
   1004 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1005 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1006 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1007 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1008 ; ALL-NEXT:    retq
   1009   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
   1010   %2 = bitcast <2 x i16> %1 to <2 x half>
   1011   %3 = fpext <2 x half> %2 to <2 x double>
   1012   ret <2 x double> %3
   1013 }
   1014 
   1015 define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) {
   1016 ; ALL-LABEL: cvt_8i16_to_4f64:
   1017 ; ALL:       # BB#0:
   1018 ; ALL-NEXT:    vmovq %xmm0, %rax
   1019 ; ALL-NEXT:    movq %rax, %rcx
   1020 ; ALL-NEXT:    movl %eax, %edx
   1021 ; ALL-NEXT:    movswl %ax, %esi
   1022 ; ALL-NEXT:    shrq $48, %rax
   1023 ; ALL-NEXT:    shrq $32, %rcx
   1024 ; ALL-NEXT:    shrl $16, %edx
   1025 ; ALL-NEXT:    movswl %dx, %edx
   1026 ; ALL-NEXT:    vmovd %edx, %xmm0
   1027 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1028 ; ALL-NEXT:    vmovd %esi, %xmm1
   1029 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1030 ; ALL-NEXT:    movswl %cx, %ecx
   1031 ; ALL-NEXT:    vmovd %ecx, %xmm2
   1032 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
   1033 ; ALL-NEXT:    cwtl
   1034 ; ALL-NEXT:    vmovd %eax, %xmm3
   1035 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
   1036 ; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1037 ; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1038 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1039 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1040 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1041 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1042 ; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1043 ; ALL-NEXT:    retq
   1044   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1045   %2 = bitcast <4 x i16> %1 to <4 x half>
   1046   %3 = fpext <4 x half> %2 to <4 x double>
   1047   ret <4 x double> %3
   1048 }
   1049 
   1050 define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) {
   1051 ; AVX1-LABEL: cvt_8i16_to_8f64:
   1052 ; AVX1:       # BB#0:
   1053 ; AVX1-NEXT:    vmovq %xmm0, %rdx
   1054 ; AVX1-NEXT:    movq %rdx, %r9
   1055 ; AVX1-NEXT:    movl %edx, %r10d
   1056 ; AVX1-NEXT:    movswl %dx, %r8d
   1057 ; AVX1-NEXT:    shrq $48, %rdx
   1058 ; AVX1-NEXT:    shrq $32, %r9
   1059 ; AVX1-NEXT:    shrl $16, %r10d
   1060 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
   1061 ; AVX1-NEXT:    movq %rdi, %rsi
   1062 ; AVX1-NEXT:    movl %edi, %eax
   1063 ; AVX1-NEXT:    movswl %di, %ecx
   1064 ; AVX1-NEXT:    shrq $48, %rdi
   1065 ; AVX1-NEXT:    shrq $32, %rsi
   1066 ; AVX1-NEXT:    shrl $16, %eax
   1067 ; AVX1-NEXT:    cwtl
   1068 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1069 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
   1070 ; AVX1-NEXT:    vmovd %ecx, %xmm0
   1071 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
   1072 ; AVX1-NEXT:    movswl %si, %eax
   1073 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1074 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
   1075 ; AVX1-NEXT:    movswl %di, %eax
   1076 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1077 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
   1078 ; AVX1-NEXT:    movswl %r10w, %eax
   1079 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1080 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
   1081 ; AVX1-NEXT:    vmovd %r8d, %xmm5
   1082 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
   1083 ; AVX1-NEXT:    movswl %r9w, %eax
   1084 ; AVX1-NEXT:    vmovd %eax, %xmm6
   1085 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
   1086 ; AVX1-NEXT:    movswl %dx, %eax
   1087 ; AVX1-NEXT:    vmovd %eax, %xmm7
   1088 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
   1089 ; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1090 ; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1091 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1092 ; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1093 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1094 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
   1095 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
   1096 ; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1097 ; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1098 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
   1099 ; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1100 ; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1101 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
   1102 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
   1103 ; AVX1-NEXT:    retq
   1104 ;
   1105 ; AVX2-LABEL: cvt_8i16_to_8f64:
   1106 ; AVX2:       # BB#0:
   1107 ; AVX2-NEXT:    vmovq %xmm0, %rdx
   1108 ; AVX2-NEXT:    movq %rdx, %r9
   1109 ; AVX2-NEXT:    movl %edx, %r10d
   1110 ; AVX2-NEXT:    movswl %dx, %r8d
   1111 ; AVX2-NEXT:    shrq $48, %rdx
   1112 ; AVX2-NEXT:    shrq $32, %r9
   1113 ; AVX2-NEXT:    shrl $16, %r10d
   1114 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
   1115 ; AVX2-NEXT:    movq %rdi, %rsi
   1116 ; AVX2-NEXT:    movl %edi, %eax
   1117 ; AVX2-NEXT:    movswl %di, %ecx
   1118 ; AVX2-NEXT:    shrq $48, %rdi
   1119 ; AVX2-NEXT:    shrq $32, %rsi
   1120 ; AVX2-NEXT:    shrl $16, %eax
   1121 ; AVX2-NEXT:    cwtl
   1122 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1123 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
   1124 ; AVX2-NEXT:    vmovd %ecx, %xmm0
   1125 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
   1126 ; AVX2-NEXT:    movswl %si, %eax
   1127 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1128 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
   1129 ; AVX2-NEXT:    movswl %di, %eax
   1130 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1131 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
   1132 ; AVX2-NEXT:    movswl %r10w, %eax
   1133 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1134 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
   1135 ; AVX2-NEXT:    vmovd %r8d, %xmm5
   1136 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
   1137 ; AVX2-NEXT:    movswl %r9w, %eax
   1138 ; AVX2-NEXT:    vmovd %eax, %xmm6
   1139 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
   1140 ; AVX2-NEXT:    movswl %dx, %eax
   1141 ; AVX2-NEXT:    vmovd %eax, %xmm7
   1142 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
   1143 ; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1144 ; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1145 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1146 ; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1147 ; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1148 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
   1149 ; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
   1150 ; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1151 ; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1152 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
   1153 ; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1154 ; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1155 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
   1156 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
   1157 ; AVX2-NEXT:    retq
   1158 ;
   1159 ; AVX512-LABEL: cvt_8i16_to_8f64:
   1160 ; AVX512:       # BB#0:
   1161 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
   1162 ; AVX512-NEXT:    movq %rdx, %r8
   1163 ; AVX512-NEXT:    movl %edx, %r10d
   1164 ; AVX512-NEXT:    movswl %dx, %r9d
   1165 ; AVX512-NEXT:    shrq $48, %rdx
   1166 ; AVX512-NEXT:    shrq $32, %r8
   1167 ; AVX512-NEXT:    shrl $16, %r10d
   1168 ; AVX512-NEXT:    vmovq %xmm0, %rdi
   1169 ; AVX512-NEXT:    movq %rdi, %rax
   1170 ; AVX512-NEXT:    movl %edi, %esi
   1171 ; AVX512-NEXT:    movswl %di, %ecx
   1172 ; AVX512-NEXT:    shrq $48, %rdi
   1173 ; AVX512-NEXT:    shrq $32, %rax
   1174 ; AVX512-NEXT:    shrl $16, %esi
   1175 ; AVX512-NEXT:    movswl %si, %esi
   1176 ; AVX512-NEXT:    vmovd %esi, %xmm0
   1177 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
   1178 ; AVX512-NEXT:    vmovd %ecx, %xmm1
   1179 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
   1180 ; AVX512-NEXT:    cwtl
   1181 ; AVX512-NEXT:    vmovd %eax, %xmm2
   1182 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
   1183 ; AVX512-NEXT:    movswl %di, %eax
   1184 ; AVX512-NEXT:    vmovd %eax, %xmm3
   1185 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
   1186 ; AVX512-NEXT:    movswl %r10w, %eax
   1187 ; AVX512-NEXT:    vmovd %eax, %xmm4
   1188 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
   1189 ; AVX512-NEXT:    vmovd %r9d, %xmm5
   1190 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
   1191 ; AVX512-NEXT:    movswl %r8w, %eax
   1192 ; AVX512-NEXT:    vmovd %eax, %xmm6
   1193 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
   1194 ; AVX512-NEXT:    movswl %dx, %eax
   1195 ; AVX512-NEXT:    vmovd %eax, %xmm7
   1196 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
   1197 ; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1198 ; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1199 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1200 ; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1201 ; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1202 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
   1203 ; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
   1204 ; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1205 ; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1206 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1207 ; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1208 ; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1209 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1210 ; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1211 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
   1212 ; AVX512-NEXT:    retq
   1213   %1 = bitcast <8 x i16> %a0 to <8 x half>
   1214   %2 = fpext <8 x half> %1 to <8 x double>
   1215   ret <8 x double> %2
   1216 }
   1217 
   1218 ;
   1219 ; Half to Double (Load)
   1220 ;
   1221 
   1222 define double @load_cvt_i16_to_f64(i16* %a0) {
   1223 ; ALL-LABEL: load_cvt_i16_to_f64:
   1224 ; ALL:       # BB#0:
   1225 ; ALL-NEXT:    movswl (%rdi), %eax
   1226 ; ALL-NEXT:    vmovd %eax, %xmm0
   1227 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1228 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1229 ; ALL-NEXT:    retq
   1230   %1 = load i16, i16* %a0
   1231   %2 = bitcast i16 %1 to half
   1232   %3 = fpext half %2 to double
   1233   ret double %3
   1234 }
   1235 
   1236 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) {
   1237 ; ALL-LABEL: load_cvt_2i16_to_2f64:
   1238 ; ALL:       # BB#0:
   1239 ; ALL-NEXT:    movswl (%rdi), %eax
   1240 ; ALL-NEXT:    vmovd %eax, %xmm0
   1241 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1242 ; ALL-NEXT:    movswl 2(%rdi), %eax
   1243 ; ALL-NEXT:    vmovd %eax, %xmm1
   1244 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1245 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1246 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1247 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1248 ; ALL-NEXT:    retq
   1249   %1 = load <2 x i16>, <2 x i16>* %a0
   1250   %2 = bitcast <2 x i16> %1 to <2 x half>
   1251   %3 = fpext <2 x half> %2 to <2 x double>
   1252   ret <2 x double> %3
   1253 }
   1254 
   1255 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) {
   1256 ; ALL-LABEL: load_cvt_4i16_to_4f64:
   1257 ; ALL:       # BB#0:
   1258 ; ALL-NEXT:    movswl (%rdi), %eax
   1259 ; ALL-NEXT:    vmovd %eax, %xmm0
   1260 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1261 ; ALL-NEXT:    movswl 2(%rdi), %eax
   1262 ; ALL-NEXT:    vmovd %eax, %xmm1
   1263 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1264 ; ALL-NEXT:    movswl 4(%rdi), %eax
   1265 ; ALL-NEXT:    vmovd %eax, %xmm2
   1266 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
   1267 ; ALL-NEXT:    movswl 6(%rdi), %eax
   1268 ; ALL-NEXT:    vmovd %eax, %xmm3
   1269 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
   1270 ; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1271 ; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1272 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1273 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1274 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1275 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1276 ; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1277 ; ALL-NEXT:    retq
   1278   %1 = load <4 x i16>, <4 x i16>* %a0
   1279   %2 = bitcast <4 x i16> %1 to <4 x half>
   1280   %3 = fpext <4 x half> %2 to <4 x double>
   1281   ret <4 x double> %3
   1282 }
   1283 
   1284 define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) {
   1285 ; ALL-LABEL: load_cvt_8i16_to_4f64:
   1286 ; ALL:       # BB#0:
   1287 ; ALL-NEXT:    movq (%rdi), %rax
   1288 ; ALL-NEXT:    movq %rax, %rcx
   1289 ; ALL-NEXT:    movl %eax, %edx
   1290 ; ALL-NEXT:    movswl %ax, %esi
   1291 ; ALL-NEXT:    shrq $48, %rax
   1292 ; ALL-NEXT:    shrq $32, %rcx
   1293 ; ALL-NEXT:    shrl $16, %edx
   1294 ; ALL-NEXT:    movswl %dx, %edx
   1295 ; ALL-NEXT:    vmovd %edx, %xmm0
   1296 ; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
   1297 ; ALL-NEXT:    vmovd %esi, %xmm1
   1298 ; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
   1299 ; ALL-NEXT:    movswl %cx, %ecx
   1300 ; ALL-NEXT:    vmovd %ecx, %xmm2
   1301 ; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
   1302 ; ALL-NEXT:    cwtl
   1303 ; ALL-NEXT:    vmovd %eax, %xmm3
   1304 ; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
   1305 ; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1306 ; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1307 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1308 ; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1309 ; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1310 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1311 ; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1312 ; ALL-NEXT:    retq
   1313   %1 = load <8 x i16>, <8 x i16>* %a0
   1314   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1315   %3 = bitcast <4 x i16> %2 to <4 x half>
   1316   %4 = fpext <4 x half> %3 to <4 x double>
   1317   ret <4 x double> %4
   1318 }
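; Note that load_cvt_8i16_to_4f64 above reads the <8 x i16> source with a
; single 64-bit load and carves out the four half values with shifts and
; sign-extensions, rather than issuing one movswl load per element as in
; load_cvt_4i16_to_4f64.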
   1319 
   1320 define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) {
   1321 ; AVX1-LABEL: load_cvt_8i16_to_8f64:
   1322 ; AVX1:       # BB#0:
   1323 ; AVX1-NEXT:    movswl 8(%rdi), %eax
   1324 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1325 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
   1326 ; AVX1-NEXT:    movswl 10(%rdi), %eax
   1327 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1328 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
   1329 ; AVX1-NEXT:    movswl 12(%rdi), %eax
   1330 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1331 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
   1332 ; AVX1-NEXT:    movswl 14(%rdi), %eax
   1333 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1334 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
   1335 ; AVX1-NEXT:    movswl (%rdi), %eax
   1336 ; AVX1-NEXT:    vmovd %eax, %xmm0
   1337 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
   1338 ; AVX1-NEXT:    movswl 2(%rdi), %eax
   1339 ; AVX1-NEXT:    vmovd %eax, %xmm5
   1340 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
   1341 ; AVX1-NEXT:    movswl 4(%rdi), %eax
   1342 ; AVX1-NEXT:    vmovd %eax, %xmm6
   1343 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
   1344 ; AVX1-NEXT:    movswl 6(%rdi), %eax
   1345 ; AVX1-NEXT:    vmovd %eax, %xmm7
   1346 ; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
   1347 ; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1348 ; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1349 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1350 ; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1351 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1352 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
   1353 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
   1354 ; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1355 ; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1356 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
   1357 ; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1358 ; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1359 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1360 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
   1361 ; AVX1-NEXT:    retq
   1362 ;
   1363 ; AVX2-LABEL: load_cvt_8i16_to_8f64:
   1364 ; AVX2:       # BB#0:
   1365 ; AVX2-NEXT:    movswl 8(%rdi), %eax
   1366 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1367 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
   1368 ; AVX2-NEXT:    movswl 10(%rdi), %eax
   1369 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1370 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
   1371 ; AVX2-NEXT:    movswl 12(%rdi), %eax
   1372 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1373 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
   1374 ; AVX2-NEXT:    movswl 14(%rdi), %eax
   1375 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1376 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
   1377 ; AVX2-NEXT:    movswl (%rdi), %eax
   1378 ; AVX2-NEXT:    vmovd %eax, %xmm0
   1379 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
   1380 ; AVX2-NEXT:    movswl 2(%rdi), %eax
   1381 ; AVX2-NEXT:    vmovd %eax, %xmm5
   1382 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
   1383 ; AVX2-NEXT:    movswl 4(%rdi), %eax
   1384 ; AVX2-NEXT:    vmovd %eax, %xmm6
   1385 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
   1386 ; AVX2-NEXT:    movswl 6(%rdi), %eax
   1387 ; AVX2-NEXT:    vmovd %eax, %xmm7
   1388 ; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
   1389 ; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1390 ; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1391 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1392 ; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1393 ; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1394 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
   1395 ; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
   1396 ; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1397 ; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1398 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
   1399 ; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1400 ; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1401 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1402 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
   1403 ; AVX2-NEXT:    retq
   1404 ;
   1405 ; AVX512-LABEL: load_cvt_8i16_to_8f64:
   1406 ; AVX512:       # BB#0:
   1407 ; AVX512-NEXT:    movswl (%rdi), %eax
   1408 ; AVX512-NEXT:    vmovd %eax, %xmm0
   1409 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
   1410 ; AVX512-NEXT:    movswl 2(%rdi), %eax
   1411 ; AVX512-NEXT:    vmovd %eax, %xmm1
   1412 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
   1413 ; AVX512-NEXT:    movswl 4(%rdi), %eax
   1414 ; AVX512-NEXT:    vmovd %eax, %xmm2
   1415 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
   1416 ; AVX512-NEXT:    movswl 6(%rdi), %eax
   1417 ; AVX512-NEXT:    vmovd %eax, %xmm3
   1418 ; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
   1419 ; AVX512-NEXT:    movswl 8(%rdi), %eax
   1420 ; AVX512-NEXT:    vmovd %eax, %xmm4
   1421 ; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
   1422 ; AVX512-NEXT:    movswl 10(%rdi), %eax
   1423 ; AVX512-NEXT:    vmovd %eax, %xmm5
   1424 ; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
   1425 ; AVX512-NEXT:    movswl 12(%rdi), %eax
   1426 ; AVX512-NEXT:    vmovd %eax, %xmm6
   1427 ; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
   1428 ; AVX512-NEXT:    movswl 14(%rdi), %eax
   1429 ; AVX512-NEXT:    vmovd %eax, %xmm7
   1430 ; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
   1431 ; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
   1432 ; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
   1433 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   1434 ; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
   1435 ; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
   1436 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
   1437 ; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
   1438 ; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
   1439 ; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
   1440 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
   1441 ; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
   1442 ; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
   1443 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1444 ; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1445 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
   1446 ; AVX512-NEXT:    retq
   1447   %1 = load <8 x i16>, <8 x i16>* %a0
   1448   %2 = bitcast <8 x i16> %1 to <8 x half>
   1449   %3 = fpext <8 x half> %2 to <8 x double>
   1450   ret <8 x double> %3
   1451 }
   1452 
   1453 ;
   1454 ; Float to Half
   1455 ;
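; With +f16c, float-to-half lowers to vcvtps2ph; the $4 immediate requests
; rounding under the current MXCSR mode. Vector results are assembled by
; moving each converted 16-bit value through a GPR and packing with
; shll/orl/shlq/orq (or vpinsrw for the 16-element case).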
   1456 
   1457 define i16 @cvt_f32_to_i16(float %a0) {
   1458 ; ALL-LABEL: cvt_f32_to_i16:
   1459 ; ALL:       # BB#0:
   1460 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1461 ; ALL-NEXT:    vmovd %xmm0, %eax
   1462 ; ALL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
   1463 ; ALL-NEXT:    retq
   1464   %1 = fptrunc float %a0 to half
   1465   %2 = bitcast half %1 to i16
   1466   ret i16 %2
   1467 }
   1468 
   1469 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) {
   1470 ; ALL-LABEL: cvt_4f32_to_4i16:
   1471 ; ALL:       # BB#0:
   1472 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1473 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1474 ; ALL-NEXT:    vmovd %xmm1, %eax
   1475 ; ALL-NEXT:    shll $16, %eax
   1476 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1477 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1478 ; ALL-NEXT:    movzwl %cx, %ecx
   1479 ; ALL-NEXT:    orl %eax, %ecx
   1480 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1481 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1482 ; ALL-NEXT:    vmovd %xmm1, %eax
   1483 ; ALL-NEXT:    shll $16, %eax
   1484 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1485 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1486 ; ALL-NEXT:    vmovd %xmm0, %edx
   1487 ; ALL-NEXT:    movzwl %dx, %edx
   1488 ; ALL-NEXT:    orl %eax, %edx
   1489 ; ALL-NEXT:    shlq $32, %rdx
   1490 ; ALL-NEXT:    orq %rcx, %rdx
   1491 ; ALL-NEXT:    vmovq %rdx, %xmm0
   1492 ; ALL-NEXT:    retq
   1493   %1 = fptrunc <4 x float> %a0 to <4 x half>
   1494   %2 = bitcast <4 x half> %1 to <4 x i16>
   1495   ret <4 x i16> %2
   1496 }
   1497 
   1498 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) {
   1499 ; ALL-LABEL: cvt_4f32_to_8i16_undef:
   1500 ; ALL:       # BB#0:
   1501 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1502 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1503 ; ALL-NEXT:    vmovd %xmm1, %eax
   1504 ; ALL-NEXT:    shll $16, %eax
   1505 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1506 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1507 ; ALL-NEXT:    movzwl %cx, %ecx
   1508 ; ALL-NEXT:    orl %eax, %ecx
   1509 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1510 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1511 ; ALL-NEXT:    vmovd %xmm1, %eax
   1512 ; ALL-NEXT:    shll $16, %eax
   1513 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1514 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1515 ; ALL-NEXT:    vmovd %xmm0, %edx
   1516 ; ALL-NEXT:    movzwl %dx, %edx
   1517 ; ALL-NEXT:    orl %eax, %edx
   1518 ; ALL-NEXT:    shlq $32, %rdx
   1519 ; ALL-NEXT:    orq %rcx, %rdx
   1520 ; ALL-NEXT:    vmovq %rdx, %xmm0
   1521 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1522 ; ALL-NEXT:    retq
   1523   %1 = fptrunc <4 x float> %a0 to <4 x half>
   1524   %2 = bitcast <4 x half> %1 to <4 x i16>
   1525   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1526   ret <8 x i16> %3
   1527 }
   1528 
   1529 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) {
   1530 ; ALL-LABEL: cvt_4f32_to_8i16_zero:
   1531 ; ALL:       # BB#0:
   1532 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1533 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1534 ; ALL-NEXT:    vmovd %xmm1, %eax
   1535 ; ALL-NEXT:    shll $16, %eax
   1536 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1537 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1538 ; ALL-NEXT:    movzwl %cx, %ecx
   1539 ; ALL-NEXT:    orl %eax, %ecx
   1540 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1541 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1542 ; ALL-NEXT:    vmovd %xmm1, %eax
   1543 ; ALL-NEXT:    shll $16, %eax
   1544 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1545 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1546 ; ALL-NEXT:    vmovd %xmm0, %edx
   1547 ; ALL-NEXT:    movzwl %dx, %edx
   1548 ; ALL-NEXT:    orl %eax, %edx
   1549 ; ALL-NEXT:    shlq $32, %rdx
   1550 ; ALL-NEXT:    orq %rcx, %rdx
   1551 ; ALL-NEXT:    vmovq %rdx, %xmm0
   1552 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   1553 ; ALL-NEXT:    retq
   1554   %1 = fptrunc <4 x float> %a0 to <4 x half>
   1555   %2 = bitcast <4 x half> %1 to <4 x i16>
   1556   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1557   ret <8 x i16> %3
   1558 }
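; The _undef and _zero variants above differ only in the final vpshufb mask:
; the undef form fills the upper four words with copies of existing lanes,
; while the zero form writes zeros into them.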
   1559 
   1560 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) {
   1561 ; AVX1-LABEL: cvt_8f32_to_8i16:
   1562 ; AVX1:       # BB#0:
   1563 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1564 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1565 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1566 ; AVX1-NEXT:    shll $16, %eax
   1567 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1568 ; AVX1-NEXT:    vmovd %xmm1, %ecx
   1569 ; AVX1-NEXT:    movzwl %cx, %ecx
   1570 ; AVX1-NEXT:    orl %eax, %ecx
   1571 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1572 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1573 ; AVX1-NEXT:    vmovd %xmm1, %edx
   1574 ; AVX1-NEXT:    shll $16, %edx
   1575 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1576 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1577 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1578 ; AVX1-NEXT:    movzwl %ax, %eax
   1579 ; AVX1-NEXT:    orl %edx, %eax
   1580 ; AVX1-NEXT:    shlq $32, %rax
   1581 ; AVX1-NEXT:    orq %rcx, %rax
   1582 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1583 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1584 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1585 ; AVX1-NEXT:    vmovd %xmm1, %ecx
   1586 ; AVX1-NEXT:    shll $16, %ecx
   1587 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1588 ; AVX1-NEXT:    vmovd %xmm1, %edx
   1589 ; AVX1-NEXT:    movzwl %dx, %edx
   1590 ; AVX1-NEXT:    orl %ecx, %edx
   1591 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1592 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1593 ; AVX1-NEXT:    vmovd %xmm1, %ecx
   1594 ; AVX1-NEXT:    shll $16, %ecx
   1595 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1596 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1597 ; AVX1-NEXT:    vmovd %xmm0, %esi
   1598 ; AVX1-NEXT:    movzwl %si, %esi
   1599 ; AVX1-NEXT:    orl %ecx, %esi
   1600 ; AVX1-NEXT:    shlq $32, %rsi
   1601 ; AVX1-NEXT:    orq %rdx, %rsi
   1602 ; AVX1-NEXT:    vmovq %rsi, %xmm0
   1603 ; AVX1-NEXT:    vmovq %rax, %xmm1
   1604 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1605 ; AVX1-NEXT:    vzeroupper
   1606 ; AVX1-NEXT:    retq
   1607 ;
   1608 ; AVX2-LABEL: cvt_8f32_to_8i16:
   1609 ; AVX2:       # BB#0:
   1610 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1611 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1612 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1613 ; AVX2-NEXT:    shll $16, %eax
   1614 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1615 ; AVX2-NEXT:    vmovd %xmm1, %ecx
   1616 ; AVX2-NEXT:    movzwl %cx, %ecx
   1617 ; AVX2-NEXT:    orl %eax, %ecx
   1618 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1619 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1620 ; AVX2-NEXT:    vmovd %xmm1, %edx
   1621 ; AVX2-NEXT:    shll $16, %edx
   1622 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1623 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1624 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1625 ; AVX2-NEXT:    movzwl %ax, %eax
   1626 ; AVX2-NEXT:    orl %edx, %eax
   1627 ; AVX2-NEXT:    shlq $32, %rax
   1628 ; AVX2-NEXT:    orq %rcx, %rax
   1629 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1630 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1631 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1632 ; AVX2-NEXT:    vmovd %xmm1, %ecx
   1633 ; AVX2-NEXT:    shll $16, %ecx
   1634 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1635 ; AVX2-NEXT:    vmovd %xmm1, %edx
   1636 ; AVX2-NEXT:    movzwl %dx, %edx
   1637 ; AVX2-NEXT:    orl %ecx, %edx
   1638 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1639 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1640 ; AVX2-NEXT:    vmovd %xmm1, %ecx
   1641 ; AVX2-NEXT:    shll $16, %ecx
   1642 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1643 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1644 ; AVX2-NEXT:    vmovd %xmm0, %esi
   1645 ; AVX2-NEXT:    movzwl %si, %esi
   1646 ; AVX2-NEXT:    orl %ecx, %esi
   1647 ; AVX2-NEXT:    shlq $32, %rsi
   1648 ; AVX2-NEXT:    orq %rdx, %rsi
   1649 ; AVX2-NEXT:    vmovq %rsi, %xmm0
   1650 ; AVX2-NEXT:    vmovq %rax, %xmm1
   1651 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1652 ; AVX2-NEXT:    vzeroupper
   1653 ; AVX2-NEXT:    retq
   1654 ;
   1655 ; AVX512-LABEL: cvt_8f32_to_8i16:
   1656 ; AVX512:       # BB#0:
   1657 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1658 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1659 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1660 ; AVX512-NEXT:    shll $16, %eax
   1661 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1662 ; AVX512-NEXT:    vmovd %xmm1, %ecx
   1663 ; AVX512-NEXT:    movzwl %cx, %ecx
   1664 ; AVX512-NEXT:    orl %eax, %ecx
   1665 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1666 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1667 ; AVX512-NEXT:    vmovd %xmm1, %edx
   1668 ; AVX512-NEXT:    shll $16, %edx
   1669 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1670 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1671 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1672 ; AVX512-NEXT:    movzwl %ax, %eax
   1673 ; AVX512-NEXT:    orl %edx, %eax
   1674 ; AVX512-NEXT:    shlq $32, %rax
   1675 ; AVX512-NEXT:    orq %rcx, %rax
   1676 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1677 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1678 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1679 ; AVX512-NEXT:    vmovd %xmm1, %ecx
   1680 ; AVX512-NEXT:    shll $16, %ecx
   1681 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1682 ; AVX512-NEXT:    vmovd %xmm1, %edx
   1683 ; AVX512-NEXT:    movzwl %dx, %edx
   1684 ; AVX512-NEXT:    orl %ecx, %edx
   1685 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1686 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1687 ; AVX512-NEXT:    vmovd %xmm1, %ecx
   1688 ; AVX512-NEXT:    shll $16, %ecx
   1689 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1690 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1691 ; AVX512-NEXT:    vmovd %xmm0, %esi
   1692 ; AVX512-NEXT:    movzwl %si, %esi
   1693 ; AVX512-NEXT:    orl %ecx, %esi
   1694 ; AVX512-NEXT:    shlq $32, %rsi
   1695 ; AVX512-NEXT:    orq %rdx, %rsi
   1696 ; AVX512-NEXT:    vmovq %rsi, %xmm0
   1697 ; AVX512-NEXT:    vmovq %rax, %xmm1
   1698 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1699 ; AVX512-NEXT:    retq
   1700   %1 = fptrunc <8 x float> %a0 to <8 x half>
   1701   %2 = bitcast <8 x half> %1 to <8 x i16>
   1702   ret <8 x i16> %2
   1703 }
   1704 
   1705 define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) {
   1706 ; AVX1-LABEL: cvt_16f32_to_16i16:
   1707 ; AVX1:       # BB#0:
   1708 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   1709 ; AVX1-NEXT:    vmovd %xmm2, %eax
   1710 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1711 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1712 ; AVX1-NEXT:    vmovd %eax, %xmm3
   1713 ; AVX1-NEXT:    vmovd %xmm2, %eax
   1714 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1715 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1716 ; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1717 ; AVX1-NEXT:    vmovd %xmm2, %eax
   1718 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1719 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   1720 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1721 ; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1722 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1723 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   1724 ; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1725 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1726 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   1727 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1728 ; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1729 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1730 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1731 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1732 ; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1733 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1734 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1735 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   1736 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1737 ; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   1738 ; AVX1-NEXT:    vmovd %xmm2, %eax
   1739 ; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   1740 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1741 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1742 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1743 ; AVX1-NEXT:    vmovd %eax, %xmm3
   1744 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1745 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1746 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1747 ; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1748 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1749 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1750 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   1751 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1752 ; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1753 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1754 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   1755 ; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1756 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1757 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   1758 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1759 ; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1760 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1761 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   1762 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1763 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1764 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1765 ; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1766 ; AVX1-NEXT:    vmovd %xmm1, %eax
   1767 ; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   1768 ; AVX1-NEXT:    vmovd %xmm0, %eax
   1769 ; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   1770 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1771 ; AVX1-NEXT:    retq
   1772 ;
   1773 ; AVX2-LABEL: cvt_16f32_to_16i16:
   1774 ; AVX2:       # BB#0:
   1775 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   1776 ; AVX2-NEXT:    vmovd %xmm2, %eax
   1777 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1778 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1779 ; AVX2-NEXT:    vmovd %eax, %xmm3
   1780 ; AVX2-NEXT:    vmovd %xmm2, %eax
   1781 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1782 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1783 ; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1784 ; AVX2-NEXT:    vmovd %xmm2, %eax
   1785 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1786 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   1787 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1788 ; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1789 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1790 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   1791 ; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1792 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1793 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   1794 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1795 ; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1796 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1797 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1798 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1799 ; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1800 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1801 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1802 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   1803 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1804 ; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   1805 ; AVX2-NEXT:    vmovd %xmm2, %eax
   1806 ; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   1807 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1808 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1809 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1810 ; AVX2-NEXT:    vmovd %eax, %xmm3
   1811 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1812 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1813 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1814 ; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1815 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1816 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1817 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   1818 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1819 ; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1820 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1821 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   1822 ; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1823 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1824 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   1825 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1826 ; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1827 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1828 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   1829 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1830 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1831 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1832 ; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1833 ; AVX2-NEXT:    vmovd %xmm1, %eax
   1834 ; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   1835 ; AVX2-NEXT:    vmovd %xmm0, %eax
   1836 ; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   1837 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   1838 ; AVX2-NEXT:    retq
   1839 ;
   1840 ; AVX512-LABEL: cvt_16f32_to_16i16:
   1841 ; AVX512:       # BB#0:
   1842 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
   1843 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
   1844 ; AVX512-NEXT:    vmovd %xmm2, %eax
   1845 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1846 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1847 ; AVX512-NEXT:    vmovd %eax, %xmm3
   1848 ; AVX512-NEXT:    vmovd %xmm2, %eax
   1849 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1850 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1851 ; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1852 ; AVX512-NEXT:    vmovd %xmm2, %eax
   1853 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1854 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
   1855 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1856 ; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1857 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1858 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
   1859 ; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1860 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1861 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   1862 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1863 ; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1864 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1865 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1866 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1867 ; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1868 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1869 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1870 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
   1871 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   1872 ; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
   1873 ; AVX512-NEXT:    vmovd %xmm2, %eax
   1874 ; AVX512-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
   1875 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1876 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1877 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1878 ; AVX512-NEXT:    vmovd %eax, %xmm3
   1879 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1880 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1881 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1882 ; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
   1883 ; AVX512-NEXT:    vmovd %xmm1, %eax
   1884 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1885 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   1886 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1887 ; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
   1888 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1889 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   1890 ; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
   1891 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1892 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
   1893 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1894 ; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
   1895 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1896 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
   1897 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1898 ; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
   1899 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1900 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
   1901 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1902 ; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
   1903 ; AVX512-NEXT:    vmovd %xmm0, %eax
   1904 ; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
   1905 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   1906 ; AVX512-NEXT:    retq
   1907   %1 = fptrunc <16 x float> %a0 to <16 x half>
   1908   %2 = bitcast <16 x half> %1 to <16 x i16>
   1909   ret <16 x i16> %2
   1910 }
   1911 
   1912 ;
   1913 ; Float to Half (Store)
   1914 ;
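; The store forms reuse the vcvtps2ph lowering above: individual lanes are
; written with movw stores, while the shuffle-widened <8 x i16> variants are
; stored with a single vmovdqa.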
   1915 
   1916 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) {
   1917 ; ALL-LABEL: store_cvt_f32_to_i16:
   1918 ; ALL:       # BB#0:
   1919 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1920 ; ALL-NEXT:    vmovd %xmm0, %eax
   1921 ; ALL-NEXT:    movw %ax, (%rdi)
   1922 ; ALL-NEXT:    retq
   1923   %1 = fptrunc float %a0 to half
   1924   %2 = bitcast half %1 to i16
   1925   store i16 %2, i16* %a1
   1926   ret void
   1927 }
   1928 
   1929 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) {
   1930 ; ALL-LABEL: store_cvt_4f32_to_4i16:
   1931 ; ALL:       # BB#0:
   1932 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1933 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1934 ; ALL-NEXT:    vmovd %xmm1, %eax
   1935 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1936 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1937 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1938 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1939 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1940 ; ALL-NEXT:    vmovd %xmm1, %edx
   1941 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1942 ; ALL-NEXT:    vmovd %xmm0, %esi
   1943 ; ALL-NEXT:    movw %si, (%rdi)
   1944 ; ALL-NEXT:    movw %dx, 6(%rdi)
   1945 ; ALL-NEXT:    movw %cx, 4(%rdi)
   1946 ; ALL-NEXT:    movw %ax, 2(%rdi)
   1947 ; ALL-NEXT:    retq
   1948   %1 = fptrunc <4 x float> %a0 to <4 x half>
   1949   %2 = bitcast <4 x half> %1 to <4 x i16>
   1950   store <4 x i16> %2, <4 x i16>* %a1
   1951   ret void
   1952 }
   1953 
   1954 define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) {
   1955 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
   1956 ; ALL:       # BB#0:
   1957 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1958 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1959 ; ALL-NEXT:    vmovd %xmm1, %eax
   1960 ; ALL-NEXT:    shll $16, %eax
   1961 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1962 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1963 ; ALL-NEXT:    movzwl %cx, %ecx
   1964 ; ALL-NEXT:    orl %eax, %ecx
   1965 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1966 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1967 ; ALL-NEXT:    vmovd %xmm1, %eax
   1968 ; ALL-NEXT:    shll $16, %eax
   1969 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1970 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   1971 ; ALL-NEXT:    vmovd %xmm0, %edx
   1972 ; ALL-NEXT:    movzwl %dx, %edx
   1973 ; ALL-NEXT:    orl %eax, %edx
   1974 ; ALL-NEXT:    shlq $32, %rdx
   1975 ; ALL-NEXT:    orq %rcx, %rdx
   1976 ; ALL-NEXT:    vmovq %rdx, %xmm0
   1977 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1978 ; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
   1979 ; ALL-NEXT:    retq
   1980   %1 = fptrunc <4 x float> %a0 to <4 x half>
   1981   %2 = bitcast <4 x half> %1 to <4 x i16>
   1982   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1983   store <8 x i16> %3, <8 x i16>* %a1
   1984   ret void
   1985 }
   1986 
   1987 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) {
   1988 ; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
   1989 ; ALL:       # BB#0:
   1990 ; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   1991 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   1992 ; ALL-NEXT:    vmovd %xmm1, %eax
   1993 ; ALL-NEXT:    shll $16, %eax
   1994 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
   1995 ; ALL-NEXT:    vmovd %xmm1, %ecx
   1996 ; ALL-NEXT:    movzwl %cx, %ecx
   1997 ; ALL-NEXT:    orl %eax, %ecx
   1998 ; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1999 ; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2000 ; ALL-NEXT:    vmovd %xmm1, %eax
   2001 ; ALL-NEXT:    shll $16, %eax
   2002 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2003 ; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2004 ; ALL-NEXT:    vmovd %xmm0, %edx
   2005 ; ALL-NEXT:    movzwl %dx, %edx
   2006 ; ALL-NEXT:    orl %eax, %edx
   2007 ; ALL-NEXT:    shlq $32, %rdx
   2008 ; ALL-NEXT:    orq %rcx, %rdx
   2009 ; ALL-NEXT:    vmovq %rdx, %xmm0
   2010 ; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   2011 ; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
   2012 ; ALL-NEXT:    retq
   2013   %1 = fptrunc <4 x float> %a0 to <4 x half>
   2014   %2 = bitcast <4 x half> %1 to <4 x i16>
   2015   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2016   store <8 x i16> %3, <8 x i16>* %a1
   2017   ret void
   2018 }
   2019 
   2020 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) {
   2021 ; AVX1-LABEL: store_cvt_8f32_to_8i16:
   2022 ; AVX1:       # BB#0:
   2023 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2024 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2025 ; AVX1-NEXT:    vmovd %xmm1, %r8d
   2026 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2027 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2028 ; AVX1-NEXT:    vmovd %xmm1, %r9d
   2029 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2030 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2031 ; AVX1-NEXT:    vmovd %xmm1, %r10d
   2032 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2033 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2034 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2035 ; AVX1-NEXT:    vmovd %xmm2, %r11d
   2036 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2037 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2038 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2039 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
   2040 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2041 ; AVX1-NEXT:    vmovd %xmm2, %ecx
   2042 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2043 ; AVX1-NEXT:    vmovd %xmm0, %edx
   2044 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2045 ; AVX1-NEXT:    vmovd %xmm0, %esi
   2046 ; AVX1-NEXT:    movw %si, 8(%rdi)
   2047 ; AVX1-NEXT:    movw %dx, (%rdi)
   2048 ; AVX1-NEXT:    movw %cx, 14(%rdi)
   2049 ; AVX1-NEXT:    movw %ax, 12(%rdi)
   2050 ; AVX1-NEXT:    movw %r11w, 10(%rdi)
   2051 ; AVX1-NEXT:    movw %r10w, 6(%rdi)
   2052 ; AVX1-NEXT:    movw %r9w, 4(%rdi)
   2053 ; AVX1-NEXT:    movw %r8w, 2(%rdi)
   2054 ; AVX1-NEXT:    vzeroupper
   2055 ; AVX1-NEXT:    retq
   2056 ;
   2057 ; AVX2-LABEL: store_cvt_8f32_to_8i16:
   2058 ; AVX2:       # BB#0:
   2059 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2060 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2061 ; AVX2-NEXT:    vmovd %xmm1, %r8d
   2062 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2063 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2064 ; AVX2-NEXT:    vmovd %xmm1, %r9d
   2065 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2066 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2067 ; AVX2-NEXT:    vmovd %xmm1, %r10d
   2068 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2069 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2070 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2071 ; AVX2-NEXT:    vmovd %xmm2, %r11d
   2072 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2073 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2074 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2075 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
   2076 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2077 ; AVX2-NEXT:    vmovd %xmm2, %ecx
   2078 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2079 ; AVX2-NEXT:    vmovd %xmm0, %edx
   2080 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2081 ; AVX2-NEXT:    vmovd %xmm0, %esi
   2082 ; AVX2-NEXT:    movw %si, 8(%rdi)
   2083 ; AVX2-NEXT:    movw %dx, (%rdi)
   2084 ; AVX2-NEXT:    movw %cx, 14(%rdi)
   2085 ; AVX2-NEXT:    movw %ax, 12(%rdi)
   2086 ; AVX2-NEXT:    movw %r11w, 10(%rdi)
   2087 ; AVX2-NEXT:    movw %r10w, 6(%rdi)
   2088 ; AVX2-NEXT:    movw %r9w, 4(%rdi)
   2089 ; AVX2-NEXT:    movw %r8w, 2(%rdi)
   2090 ; AVX2-NEXT:    vzeroupper
   2091 ; AVX2-NEXT:    retq
   2092 ;
   2093 ; AVX512-LABEL: store_cvt_8f32_to_8i16:
   2094 ; AVX512:       # BB#0:
   2095 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
   2096 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2097 ; AVX512-NEXT:    vmovd %xmm1, %r8d
   2098 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   2099 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2100 ; AVX512-NEXT:    vmovd %xmm1, %r9d
   2101 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2102 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2103 ; AVX512-NEXT:    vmovd %xmm1, %r10d
   2104 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2105 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2106 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2107 ; AVX512-NEXT:    vmovd %xmm2, %r11d
   2108 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   2109 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2110 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2111 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
   2112 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2113 ; AVX512-NEXT:    vmovd %xmm2, %ecx
   2114 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2115 ; AVX512-NEXT:    vmovd %xmm0, %edx
   2116 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
   2117 ; AVX512-NEXT:    vmovd %xmm0, %esi
   2118 ; AVX512-NEXT:    movw %si, 8(%rdi)
   2119 ; AVX512-NEXT:    movw %dx, (%rdi)
   2120 ; AVX512-NEXT:    movw %cx, 14(%rdi)
   2121 ; AVX512-NEXT:    movw %ax, 12(%rdi)
   2122 ; AVX512-NEXT:    movw %r11w, 10(%rdi)
   2123 ; AVX512-NEXT:    movw %r10w, 6(%rdi)
   2124 ; AVX512-NEXT:    movw %r9w, 4(%rdi)
   2125 ; AVX512-NEXT:    movw %r8w, 2(%rdi)
   2126 ; AVX512-NEXT:    retq
   2127   %1 = fptrunc <8 x float> %a0 to <8 x half>
   2128   %2 = bitcast <8 x half> %1 to <8 x i16>
   2129   store <8 x i16> %2, <8 x i16>* %a1
   2130   ret void
   2131 }
   2132 
   2133 define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) {
   2134 ; AVX1-LABEL: store_cvt_16f32_to_16i16:
   2135 ; AVX1:       # BB#0:
   2136 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2137 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   2138 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
   2139 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2140 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
   2141 ; AVX1-NEXT:    movw %ax, 24(%rdi)
   2142 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2143 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
   2144 ; AVX1-NEXT:    movw %ax, 16(%rdi)
   2145 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2146 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
   2147 ; AVX1-NEXT:    movw %ax, 8(%rdi)
   2148 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2149 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
   2150 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2151 ; AVX1-NEXT:    movw %ax, (%rdi)
   2152 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2153 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
   2154 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2155 ; AVX1-NEXT:    movw %ax, 30(%rdi)
   2156 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2157 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
   2158 ; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2159 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
   2160 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2161 ; AVX1-NEXT:    movw %ax, 28(%rdi)
   2162 ; AVX1-NEXT:    vmovd %xmm3, %eax
   2163 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
   2164 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2165 ; AVX1-NEXT:    movw %ax, 26(%rdi)
   2166 ; AVX1-NEXT:    vmovd %xmm3, %eax
   2167 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   2168 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2169 ; AVX1-NEXT:    movw %ax, 22(%rdi)
   2170 ; AVX1-NEXT:    vmovd %xmm3, %eax
   2171 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
   2172 ; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2173 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2174 ; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2175 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
   2176 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2177 ; AVX1-NEXT:    movw %ax, 20(%rdi)
   2178 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2179 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
   2180 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2181 ; AVX1-NEXT:    movw %ax, 18(%rdi)
   2182 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2183 ; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   2184 ; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2185 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   2186 ; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2187 ; AVX1-NEXT:    movw %ax, 14(%rdi)
   2188 ; AVX1-NEXT:    vmovd %xmm2, %eax
   2189 ; AVX1-NEXT:    movw %ax, 12(%rdi)
   2190 ; AVX1-NEXT:    vmovd %xmm1, %eax
   2191 ; AVX1-NEXT:    movw %ax, 10(%rdi)
   2192 ; AVX1-NEXT:    vmovd %xmm0, %eax
   2193 ; AVX1-NEXT:    movw %ax, 6(%rdi)
   2194 ; AVX1-NEXT:    vmovd %xmm3, %eax
   2195 ; AVX1-NEXT:    movw %ax, 4(%rdi)
   2196 ; AVX1-NEXT:    vmovd %xmm4, %eax
   2197 ; AVX1-NEXT:    movw %ax, 2(%rdi)
   2198 ; AVX1-NEXT:    vzeroupper
   2199 ; AVX1-NEXT:    retq
   2200 ;
   2201 ; AVX2-LABEL: store_cvt_16f32_to_16i16:
   2202 ; AVX2:       # BB#0:
   2203 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2204 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
   2205 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
   2206 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2207 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
   2208 ; AVX2-NEXT:    movw %ax, 24(%rdi)
   2209 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2210 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
   2211 ; AVX2-NEXT:    movw %ax, 16(%rdi)
   2212 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2213 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
   2214 ; AVX2-NEXT:    movw %ax, 8(%rdi)
   2215 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2216 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
   2217 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2218 ; AVX2-NEXT:    movw %ax, (%rdi)
   2219 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2220 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
   2221 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2222 ; AVX2-NEXT:    movw %ax, 30(%rdi)
   2223 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2224 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
   2225 ; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2226 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
   2227 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2228 ; AVX2-NEXT:    movw %ax, 28(%rdi)
   2229 ; AVX2-NEXT:    vmovd %xmm3, %eax
   2230 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
   2231 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2232 ; AVX2-NEXT:    movw %ax, 26(%rdi)
   2233 ; AVX2-NEXT:    vmovd %xmm3, %eax
   2234 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   2235 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2236 ; AVX2-NEXT:    movw %ax, 22(%rdi)
   2237 ; AVX2-NEXT:    vmovd %xmm3, %eax
   2238 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
   2239 ; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2240 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2241 ; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2242 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
   2243 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2244 ; AVX2-NEXT:    movw %ax, 20(%rdi)
   2245 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2246 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
   2247 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2248 ; AVX2-NEXT:    movw %ax, 18(%rdi)
   2249 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2250 ; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
   2251 ; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2252 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   2253 ; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2254 ; AVX2-NEXT:    movw %ax, 14(%rdi)
   2255 ; AVX2-NEXT:    vmovd %xmm2, %eax
   2256 ; AVX2-NEXT:    movw %ax, 12(%rdi)
   2257 ; AVX2-NEXT:    vmovd %xmm1, %eax
   2258 ; AVX2-NEXT:    movw %ax, 10(%rdi)
   2259 ; AVX2-NEXT:    vmovd %xmm0, %eax
   2260 ; AVX2-NEXT:    movw %ax, 6(%rdi)
   2261 ; AVX2-NEXT:    vmovd %xmm3, %eax
   2262 ; AVX2-NEXT:    movw %ax, 4(%rdi)
   2263 ; AVX2-NEXT:    vmovd %xmm4, %eax
   2264 ; AVX2-NEXT:    movw %ax, 2(%rdi)
   2265 ; AVX2-NEXT:    vzeroupper
   2266 ; AVX2-NEXT:    retq
   2267 ;
   2268 ; AVX512-LABEL: store_cvt_16f32_to_16i16:
   2269 ; AVX512:       # BB#0:
   2270 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2271 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
   2272 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
   2273 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
   2274 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2275 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
   2276 ; AVX512-NEXT:    movw %ax, 24(%rdi)
   2277 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2278 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
   2279 ; AVX512-NEXT:    movw %ax, 16(%rdi)
   2280 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2281 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
   2282 ; AVX512-NEXT:    movw %ax, 8(%rdi)
   2283 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2284 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
   2285 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2286 ; AVX512-NEXT:    movw %ax, (%rdi)
   2287 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2288 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
   2289 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2290 ; AVX512-NEXT:    movw %ax, 30(%rdi)
   2291 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2292 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
   2293 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
   2294 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
   2295 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2296 ; AVX512-NEXT:    movw %ax, 28(%rdi)
   2297 ; AVX512-NEXT:    vmovd %xmm3, %eax
   2298 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
   2299 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2300 ; AVX512-NEXT:    movw %ax, 26(%rdi)
   2301 ; AVX512-NEXT:    vmovd %xmm3, %eax
   2302 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
   2303 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2304 ; AVX512-NEXT:    movw %ax, 22(%rdi)
   2305 ; AVX512-NEXT:    vmovd %xmm3, %eax
   2306 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
   2307 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
   2308 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
   2309 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
   2310 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
   2311 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2312 ; AVX512-NEXT:    movw %ax, 20(%rdi)
   2313 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2314 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
   2315 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2316 ; AVX512-NEXT:    movw %ax, 18(%rdi)
   2317 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2318 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
   2319 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
   2320 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   2321 ; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
   2322 ; AVX512-NEXT:    movw %ax, 14(%rdi)
   2323 ; AVX512-NEXT:    vmovd %xmm1, %eax
   2324 ; AVX512-NEXT:    movw %ax, 12(%rdi)
   2325 ; AVX512-NEXT:    vmovd %xmm2, %eax
   2326 ; AVX512-NEXT:    movw %ax, 10(%rdi)
   2327 ; AVX512-NEXT:    vmovd %xmm0, %eax
   2328 ; AVX512-NEXT:    movw %ax, 6(%rdi)
   2329 ; AVX512-NEXT:    vmovd %xmm3, %eax
   2330 ; AVX512-NEXT:    movw %ax, 4(%rdi)
   2331 ; AVX512-NEXT:    vmovd %xmm4, %eax
   2332 ; AVX512-NEXT:    movw %ax, 2(%rdi)
   2333 ; AVX512-NEXT:    retq
   2334   %1 = fptrunc <16 x float> %a0 to <16 x half>
   2335   %2 = bitcast <16 x half> %1 to <16 x i16>
   2336   store <16 x i16> %2, <16 x i16>* %a1
   2337   ret void
   2338 }
   2339 
   2340 ;
   2341 ; Double to Half
   2342 ;
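; Neither F16C nor AVX-512F has a double-to-half conversion, so these tests
; expect libcalls to the __truncdfhf2 runtime helper: a tail call in the
; scalar case, and one call per element (with the source vector spilled and
; reloaded around each call) in the vector cases.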
   2343 
   2344 define i16 @cvt_f64_to_i16(double %a0) {
   2345 ; ALL-LABEL: cvt_f64_to_i16:
   2346 ; ALL:       # BB#0:
   2347 ; ALL-NEXT:    jmp __truncdfhf2 # TAILCALL
   2348   %1 = fptrunc double %a0 to half
   2349   %2 = bitcast half %1 to i16
   2350   ret i16 %2
   2351 }
   2352 
   2353 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) {
   2354 ; ALL-LABEL: cvt_2f64_to_2i16:
   2355 ; ALL:       # BB#0:
   2356 ; ALL-NEXT:    pushq %rbx
   2357 ; ALL-NEXT:  .Ltmp0:
   2358 ; ALL-NEXT:    .cfi_def_cfa_offset 16
   2359 ; ALL-NEXT:    subq $16, %rsp
   2360 ; ALL-NEXT:  .Ltmp1:
   2361 ; ALL-NEXT:    .cfi_def_cfa_offset 32
   2362 ; ALL-NEXT:  .Ltmp2:
   2363 ; ALL-NEXT:    .cfi_offset %rbx, -16
   2364 ; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2365 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2366 ; ALL-NEXT:    callq __truncdfhf2
   2367 ; ALL-NEXT:    movw %ax, %bx
   2368 ; ALL-NEXT:    shll $16, %ebx
   2369 ; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2370 ; ALL-NEXT:    callq __truncdfhf2
   2371 ; ALL-NEXT:    movzwl %ax, %eax
   2372 ; ALL-NEXT:    orl %ebx, %eax
   2373 ; ALL-NEXT:    vmovd %eax, %xmm0
   2374 ; ALL-NEXT:    addq $16, %rsp
   2375 ; ALL-NEXT:    popq %rbx
   2376 ; ALL-NEXT:    retq
   2377   %1 = fptrunc <2 x double> %a0 to <2 x half>
   2378   %2 = bitcast <2 x half> %1 to <2 x i16>
   2379   ret <2 x i16> %2
   2380 }
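; The source vector in cvt_2f64_to_2i16 above is spilled to the stack because
; XMM registers are not preserved across calls in the SysV ABI; each
; __truncdfhf2 result is returned in %ax and merged with shll $16/orl before
; the final vmovd.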
   2381 
   2382 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) {
   2383 ; AVX1-LABEL: cvt_4f64_to_4i16:
   2384 ; AVX1:       # BB#0:
   2385 ; AVX1-NEXT:    pushq %r14
   2386 ; AVX1-NEXT:  .Ltmp3:
   2387 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   2388 ; AVX1-NEXT:    pushq %rbx
   2389 ; AVX1-NEXT:  .Ltmp4:
   2390 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   2391 ; AVX1-NEXT:    subq $40, %rsp
   2392 ; AVX1-NEXT:  .Ltmp5:
   2393 ; AVX1-NEXT:    .cfi_def_cfa_offset 64
   2394 ; AVX1-NEXT:  .Ltmp6:
   2395 ; AVX1-NEXT:    .cfi_offset %rbx, -24
   2396 ; AVX1-NEXT:  .Ltmp7:
   2397 ; AVX1-NEXT:    .cfi_offset %r14, -16
   2398 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2399 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2400 ; AVX1-NEXT:    vzeroupper
   2401 ; AVX1-NEXT:    callq __truncdfhf2
   2402 ; AVX1-NEXT:    movw %ax, %bx
   2403 ; AVX1-NEXT:    shll $16, %ebx
   2404 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2405 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2406 ; AVX1-NEXT:    vzeroupper
   2407 ; AVX1-NEXT:    callq __truncdfhf2
   2408 ; AVX1-NEXT:    movzwl %ax, %r14d
   2409 ; AVX1-NEXT:    orl %ebx, %r14d
   2410 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2411 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2412 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2413 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2414 ; AVX1-NEXT:    vzeroupper
   2415 ; AVX1-NEXT:    callq __truncdfhf2
   2416 ; AVX1-NEXT:    movw %ax, %bx
   2417 ; AVX1-NEXT:    shll $16, %ebx
   2418 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2419 ; AVX1-NEXT:    callq __truncdfhf2
   2420 ; AVX1-NEXT:    movzwl %ax, %eax
   2421 ; AVX1-NEXT:    orl %ebx, %eax
   2422 ; AVX1-NEXT:    shlq $32, %rax
   2423 ; AVX1-NEXT:    orq %r14, %rax
   2424 ; AVX1-NEXT:    vmovq %rax, %xmm0
   2425 ; AVX1-NEXT:    addq $40, %rsp
   2426 ; AVX1-NEXT:    popq %rbx
   2427 ; AVX1-NEXT:    popq %r14
   2428 ; AVX1-NEXT:    retq
   2429 ;
   2430 ; AVX2-LABEL: cvt_4f64_to_4i16:
   2431 ; AVX2:       # BB#0:
   2432 ; AVX2-NEXT:    pushq %r14
   2433 ; AVX2-NEXT:  .Ltmp3:
   2434 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   2435 ; AVX2-NEXT:    pushq %rbx
   2436 ; AVX2-NEXT:  .Ltmp4:
   2437 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   2438 ; AVX2-NEXT:    subq $40, %rsp
   2439 ; AVX2-NEXT:  .Ltmp5:
   2440 ; AVX2-NEXT:    .cfi_def_cfa_offset 64
   2441 ; AVX2-NEXT:  .Ltmp6:
   2442 ; AVX2-NEXT:    .cfi_offset %rbx, -24
   2443 ; AVX2-NEXT:  .Ltmp7:
   2444 ; AVX2-NEXT:    .cfi_offset %r14, -16
   2445 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2446 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2447 ; AVX2-NEXT:    vzeroupper
   2448 ; AVX2-NEXT:    callq __truncdfhf2
   2449 ; AVX2-NEXT:    movw %ax, %bx
   2450 ; AVX2-NEXT:    shll $16, %ebx
   2451 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2452 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2453 ; AVX2-NEXT:    vzeroupper
   2454 ; AVX2-NEXT:    callq __truncdfhf2
   2455 ; AVX2-NEXT:    movzwl %ax, %r14d
   2456 ; AVX2-NEXT:    orl %ebx, %r14d
   2457 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2458 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2459 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2460 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2461 ; AVX2-NEXT:    vzeroupper
   2462 ; AVX2-NEXT:    callq __truncdfhf2
   2463 ; AVX2-NEXT:    movw %ax, %bx
   2464 ; AVX2-NEXT:    shll $16, %ebx
   2465 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2466 ; AVX2-NEXT:    callq __truncdfhf2
   2467 ; AVX2-NEXT:    movzwl %ax, %eax
   2468 ; AVX2-NEXT:    orl %ebx, %eax
   2469 ; AVX2-NEXT:    shlq $32, %rax
   2470 ; AVX2-NEXT:    orq %r14, %rax
   2471 ; AVX2-NEXT:    vmovq %rax, %xmm0
   2472 ; AVX2-NEXT:    addq $40, %rsp
   2473 ; AVX2-NEXT:    popq %rbx
   2474 ; AVX2-NEXT:    popq %r14
   2475 ; AVX2-NEXT:    retq
   2476 ;
   2477 ; AVX512-LABEL: cvt_4f64_to_4i16:
   2478 ; AVX512:       # BB#0:
   2479 ; AVX512-NEXT:    pushq %r14
   2480 ; AVX512-NEXT:  .Ltmp3:
   2481 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   2482 ; AVX512-NEXT:    pushq %rbx
   2483 ; AVX512-NEXT:  .Ltmp4:
   2484 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   2485 ; AVX512-NEXT:    subq $40, %rsp
   2486 ; AVX512-NEXT:  .Ltmp5:
   2487 ; AVX512-NEXT:    .cfi_def_cfa_offset 64
   2488 ; AVX512-NEXT:  .Ltmp6:
   2489 ; AVX512-NEXT:    .cfi_offset %rbx, -24
   2490 ; AVX512-NEXT:  .Ltmp7:
   2491 ; AVX512-NEXT:    .cfi_offset %r14, -16
   2492 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2493 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2494 ; AVX512-NEXT:    callq __truncdfhf2
   2495 ; AVX512-NEXT:    movw %ax, %bx
   2496 ; AVX512-NEXT:    shll $16, %ebx
   2497 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2498 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2499 ; AVX512-NEXT:    callq __truncdfhf2
   2500 ; AVX512-NEXT:    movzwl %ax, %r14d
   2501 ; AVX512-NEXT:    orl %ebx, %r14d
   2502 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2503 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2504 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2505 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2506 ; AVX512-NEXT:    callq __truncdfhf2
   2507 ; AVX512-NEXT:    movw %ax, %bx
   2508 ; AVX512-NEXT:    shll $16, %ebx
   2509 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2510 ; AVX512-NEXT:    callq __truncdfhf2
   2511 ; AVX512-NEXT:    movzwl %ax, %eax
   2512 ; AVX512-NEXT:    orl %ebx, %eax
   2513 ; AVX512-NEXT:    shlq $32, %rax
   2514 ; AVX512-NEXT:    orq %r14, %rax
   2515 ; AVX512-NEXT:    vmovq %rax, %xmm0
   2516 ; AVX512-NEXT:    addq $40, %rsp
   2517 ; AVX512-NEXT:    popq %rbx
   2518 ; AVX512-NEXT:    popq %r14
   2519 ; AVX512-NEXT:    retq
   2520   %1 = fptrunc <4 x double> %a0 to <4 x half>
   2521   %2 = bitcast <4 x half> %1 to <4 x i16>
   2522   ret <4 x i16> %2
   2523 }
   2524 
   2525 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) {
   2526 ; AVX1-LABEL: cvt_4f64_to_8i16_undef:
   2527 ; AVX1:       # BB#0:
   2528 ; AVX1-NEXT:    pushq %r14
   2529 ; AVX1-NEXT:  .Ltmp8:
   2530 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   2531 ; AVX1-NEXT:    pushq %rbx
   2532 ; AVX1-NEXT:  .Ltmp9:
   2533 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   2534 ; AVX1-NEXT:    subq $40, %rsp
   2535 ; AVX1-NEXT:  .Ltmp10:
   2536 ; AVX1-NEXT:    .cfi_def_cfa_offset 64
   2537 ; AVX1-NEXT:  .Ltmp11:
   2538 ; AVX1-NEXT:    .cfi_offset %rbx, -24
   2539 ; AVX1-NEXT:  .Ltmp12:
   2540 ; AVX1-NEXT:    .cfi_offset %r14, -16
   2541 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2542 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2543 ; AVX1-NEXT:    vzeroupper
   2544 ; AVX1-NEXT:    callq __truncdfhf2
   2545 ; AVX1-NEXT:    movw %ax, %bx
   2546 ; AVX1-NEXT:    shll $16, %ebx
   2547 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2548 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2549 ; AVX1-NEXT:    vzeroupper
   2550 ; AVX1-NEXT:    callq __truncdfhf2
   2551 ; AVX1-NEXT:    movzwl %ax, %r14d
   2552 ; AVX1-NEXT:    orl %ebx, %r14d
   2553 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2554 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2555 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2556 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2557 ; AVX1-NEXT:    vzeroupper
   2558 ; AVX1-NEXT:    callq __truncdfhf2
   2559 ; AVX1-NEXT:    movw %ax, %bx
   2560 ; AVX1-NEXT:    shll $16, %ebx
   2561 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2562 ; AVX1-NEXT:    callq __truncdfhf2
   2563 ; AVX1-NEXT:    movzwl %ax, %eax
   2564 ; AVX1-NEXT:    orl %ebx, %eax
   2565 ; AVX1-NEXT:    shlq $32, %rax
   2566 ; AVX1-NEXT:    orq %r14, %rax
   2567 ; AVX1-NEXT:    vmovq %rax, %xmm0
   2568 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2569 ; AVX1-NEXT:    addq $40, %rsp
   2570 ; AVX1-NEXT:    popq %rbx
   2571 ; AVX1-NEXT:    popq %r14
   2572 ; AVX1-NEXT:    retq
   2573 ;
   2574 ; AVX2-LABEL: cvt_4f64_to_8i16_undef:
   2575 ; AVX2:       # BB#0:
   2576 ; AVX2-NEXT:    pushq %r14
   2577 ; AVX2-NEXT:  .Ltmp8:
   2578 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   2579 ; AVX2-NEXT:    pushq %rbx
   2580 ; AVX2-NEXT:  .Ltmp9:
   2581 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   2582 ; AVX2-NEXT:    subq $40, %rsp
   2583 ; AVX2-NEXT:  .Ltmp10:
   2584 ; AVX2-NEXT:    .cfi_def_cfa_offset 64
   2585 ; AVX2-NEXT:  .Ltmp11:
   2586 ; AVX2-NEXT:    .cfi_offset %rbx, -24
   2587 ; AVX2-NEXT:  .Ltmp12:
   2588 ; AVX2-NEXT:    .cfi_offset %r14, -16
   2589 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2590 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2591 ; AVX2-NEXT:    vzeroupper
   2592 ; AVX2-NEXT:    callq __truncdfhf2
   2593 ; AVX2-NEXT:    movw %ax, %bx
   2594 ; AVX2-NEXT:    shll $16, %ebx
   2595 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2596 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2597 ; AVX2-NEXT:    vzeroupper
   2598 ; AVX2-NEXT:    callq __truncdfhf2
   2599 ; AVX2-NEXT:    movzwl %ax, %r14d
   2600 ; AVX2-NEXT:    orl %ebx, %r14d
   2601 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2602 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2603 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2604 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2605 ; AVX2-NEXT:    vzeroupper
   2606 ; AVX2-NEXT:    callq __truncdfhf2
   2607 ; AVX2-NEXT:    movw %ax, %bx
   2608 ; AVX2-NEXT:    shll $16, %ebx
   2609 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2610 ; AVX2-NEXT:    callq __truncdfhf2
   2611 ; AVX2-NEXT:    movzwl %ax, %eax
   2612 ; AVX2-NEXT:    orl %ebx, %eax
   2613 ; AVX2-NEXT:    shlq $32, %rax
   2614 ; AVX2-NEXT:    orq %r14, %rax
   2615 ; AVX2-NEXT:    vmovq %rax, %xmm0
   2616 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2617 ; AVX2-NEXT:    addq $40, %rsp
   2618 ; AVX2-NEXT:    popq %rbx
   2619 ; AVX2-NEXT:    popq %r14
   2620 ; AVX2-NEXT:    retq
   2621 ;
   2622 ; AVX512-LABEL: cvt_4f64_to_8i16_undef:
   2623 ; AVX512:       # BB#0:
   2624 ; AVX512-NEXT:    pushq %r14
   2625 ; AVX512-NEXT:  .Ltmp8:
   2626 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   2627 ; AVX512-NEXT:    pushq %rbx
   2628 ; AVX512-NEXT:  .Ltmp9:
   2629 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   2630 ; AVX512-NEXT:    subq $40, %rsp
   2631 ; AVX512-NEXT:  .Ltmp10:
   2632 ; AVX512-NEXT:    .cfi_def_cfa_offset 64
   2633 ; AVX512-NEXT:  .Ltmp11:
   2634 ; AVX512-NEXT:    .cfi_offset %rbx, -24
   2635 ; AVX512-NEXT:  .Ltmp12:
   2636 ; AVX512-NEXT:    .cfi_offset %r14, -16
   2637 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2638 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2639 ; AVX512-NEXT:    callq __truncdfhf2
   2640 ; AVX512-NEXT:    movw %ax, %bx
   2641 ; AVX512-NEXT:    shll $16, %ebx
   2642 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2643 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2644 ; AVX512-NEXT:    callq __truncdfhf2
   2645 ; AVX512-NEXT:    movzwl %ax, %r14d
   2646 ; AVX512-NEXT:    orl %ebx, %r14d
   2647 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2648 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2649 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2650 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2651 ; AVX512-NEXT:    callq __truncdfhf2
   2652 ; AVX512-NEXT:    movw %ax, %bx
   2653 ; AVX512-NEXT:    shll $16, %ebx
   2654 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2655 ; AVX512-NEXT:    callq __truncdfhf2
   2656 ; AVX512-NEXT:    movzwl %ax, %eax
   2657 ; AVX512-NEXT:    orl %ebx, %eax
   2658 ; AVX512-NEXT:    shlq $32, %rax
   2659 ; AVX512-NEXT:    orq %r14, %rax
   2660 ; AVX512-NEXT:    vmovq %rax, %xmm0
   2661 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2662 ; AVX512-NEXT:    addq $40, %rsp
   2663 ; AVX512-NEXT:    popq %rbx
   2664 ; AVX512-NEXT:    popq %r14
   2665 ; AVX512-NEXT:    retq
   2666   %1 = fptrunc <4 x double> %a0 to <4 x half>
   2667   %2 = bitcast <4 x half> %1 to <4 x i16>
   2668   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2669   ret <8 x i16> %3
   2670 }
   2671 
   2672 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) {
   2673 ; AVX1-LABEL: cvt_4f64_to_8i16_zero:
   2674 ; AVX1:       # BB#0:
   2675 ; AVX1-NEXT:    pushq %r14
   2676 ; AVX1-NEXT:  .Ltmp13:
   2677 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   2678 ; AVX1-NEXT:    pushq %rbx
   2679 ; AVX1-NEXT:  .Ltmp14:
   2680 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   2681 ; AVX1-NEXT:    subq $40, %rsp
   2682 ; AVX1-NEXT:  .Ltmp15:
   2683 ; AVX1-NEXT:    .cfi_def_cfa_offset 64
   2684 ; AVX1-NEXT:  .Ltmp16:
   2685 ; AVX1-NEXT:    .cfi_offset %rbx, -24
   2686 ; AVX1-NEXT:  .Ltmp17:
   2687 ; AVX1-NEXT:    .cfi_offset %r14, -16
   2688 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2689 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2690 ; AVX1-NEXT:    vzeroupper
   2691 ; AVX1-NEXT:    callq __truncdfhf2
   2692 ; AVX1-NEXT:    movw %ax, %bx
   2693 ; AVX1-NEXT:    shll $16, %ebx
   2694 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2695 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2696 ; AVX1-NEXT:    vzeroupper
   2697 ; AVX1-NEXT:    callq __truncdfhf2
   2698 ; AVX1-NEXT:    movzwl %ax, %r14d
   2699 ; AVX1-NEXT:    orl %ebx, %r14d
   2700 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2701 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2702 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2703 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2704 ; AVX1-NEXT:    vzeroupper
   2705 ; AVX1-NEXT:    callq __truncdfhf2
   2706 ; AVX1-NEXT:    movw %ax, %bx
   2707 ; AVX1-NEXT:    shll $16, %ebx
   2708 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2709 ; AVX1-NEXT:    callq __truncdfhf2
   2710 ; AVX1-NEXT:    movzwl %ax, %eax
   2711 ; AVX1-NEXT:    orl %ebx, %eax
   2712 ; AVX1-NEXT:    shlq $32, %rax
   2713 ; AVX1-NEXT:    orq %r14, %rax
   2714 ; AVX1-NEXT:    vmovq %rax, %xmm0
   2715 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   2716 ; AVX1-NEXT:    addq $40, %rsp
   2717 ; AVX1-NEXT:    popq %rbx
   2718 ; AVX1-NEXT:    popq %r14
   2719 ; AVX1-NEXT:    retq
   2720 ;
   2721 ; AVX2-LABEL: cvt_4f64_to_8i16_zero:
   2722 ; AVX2:       # BB#0:
   2723 ; AVX2-NEXT:    pushq %r14
   2724 ; AVX2-NEXT:  .Ltmp13:
   2725 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   2726 ; AVX2-NEXT:    pushq %rbx
   2727 ; AVX2-NEXT:  .Ltmp14:
   2728 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   2729 ; AVX2-NEXT:    subq $40, %rsp
   2730 ; AVX2-NEXT:  .Ltmp15:
   2731 ; AVX2-NEXT:    .cfi_def_cfa_offset 64
   2732 ; AVX2-NEXT:  .Ltmp16:
   2733 ; AVX2-NEXT:    .cfi_offset %rbx, -24
   2734 ; AVX2-NEXT:  .Ltmp17:
   2735 ; AVX2-NEXT:    .cfi_offset %r14, -16
   2736 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2737 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2738 ; AVX2-NEXT:    vzeroupper
   2739 ; AVX2-NEXT:    callq __truncdfhf2
   2740 ; AVX2-NEXT:    movw %ax, %bx
   2741 ; AVX2-NEXT:    shll $16, %ebx
   2742 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2743 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2744 ; AVX2-NEXT:    vzeroupper
   2745 ; AVX2-NEXT:    callq __truncdfhf2
   2746 ; AVX2-NEXT:    movzwl %ax, %r14d
   2747 ; AVX2-NEXT:    orl %ebx, %r14d
   2748 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2749 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2750 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2751 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2752 ; AVX2-NEXT:    vzeroupper
   2753 ; AVX2-NEXT:    callq __truncdfhf2
   2754 ; AVX2-NEXT:    movw %ax, %bx
   2755 ; AVX2-NEXT:    shll $16, %ebx
   2756 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2757 ; AVX2-NEXT:    callq __truncdfhf2
   2758 ; AVX2-NEXT:    movzwl %ax, %eax
   2759 ; AVX2-NEXT:    orl %ebx, %eax
   2760 ; AVX2-NEXT:    shlq $32, %rax
   2761 ; AVX2-NEXT:    orq %r14, %rax
   2762 ; AVX2-NEXT:    vmovq %rax, %xmm0
   2763 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   2764 ; AVX2-NEXT:    addq $40, %rsp
   2765 ; AVX2-NEXT:    popq %rbx
   2766 ; AVX2-NEXT:    popq %r14
   2767 ; AVX2-NEXT:    retq
   2768 ;
   2769 ; AVX512-LABEL: cvt_4f64_to_8i16_zero:
   2770 ; AVX512:       # BB#0:
   2771 ; AVX512-NEXT:    pushq %r14
   2772 ; AVX512-NEXT:  .Ltmp13:
   2773 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   2774 ; AVX512-NEXT:    pushq %rbx
   2775 ; AVX512-NEXT:  .Ltmp14:
   2776 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   2777 ; AVX512-NEXT:    subq $40, %rsp
   2778 ; AVX512-NEXT:  .Ltmp15:
   2779 ; AVX512-NEXT:    .cfi_def_cfa_offset 64
   2780 ; AVX512-NEXT:  .Ltmp16:
   2781 ; AVX512-NEXT:    .cfi_offset %rbx, -24
   2782 ; AVX512-NEXT:  .Ltmp17:
   2783 ; AVX512-NEXT:    .cfi_offset %r14, -16
   2784 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   2785 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2786 ; AVX512-NEXT:    callq __truncdfhf2
   2787 ; AVX512-NEXT:    movw %ax, %bx
   2788 ; AVX512-NEXT:    shll $16, %ebx
   2789 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2790 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2791 ; AVX512-NEXT:    callq __truncdfhf2
   2792 ; AVX512-NEXT:    movzwl %ax, %r14d
   2793 ; AVX512-NEXT:    orl %ebx, %r14d
   2794 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2795 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2796 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2797 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2798 ; AVX512-NEXT:    callq __truncdfhf2
   2799 ; AVX512-NEXT:    movw %ax, %bx
   2800 ; AVX512-NEXT:    shll $16, %ebx
   2801 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2802 ; AVX512-NEXT:    callq __truncdfhf2
   2803 ; AVX512-NEXT:    movzwl %ax, %eax
   2804 ; AVX512-NEXT:    orl %ebx, %eax
   2805 ; AVX512-NEXT:    shlq $32, %rax
   2806 ; AVX512-NEXT:    orq %r14, %rax
   2807 ; AVX512-NEXT:    vmovq %rax, %xmm0
   2808 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   2809 ; AVX512-NEXT:    addq $40, %rsp
   2810 ; AVX512-NEXT:    popq %rbx
   2811 ; AVX512-NEXT:    popq %r14
   2812 ; AVX512-NEXT:    retq
   2813   %1 = fptrunc <4 x double> %a0 to <4 x half>
   2814   %2 = bitcast <4 x half> %1 to <4 x i16>
   2815   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2816   ret <8 x i16> %3
   2817 }
   2818 
   2819 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
   2820 ; AVX1-LABEL: cvt_8f64_to_8i16:
   2821 ; AVX1:       # BB#0:
   2822 ; AVX1-NEXT:    pushq %r15
   2823 ; AVX1-NEXT:  .Ltmp18:
   2824 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   2825 ; AVX1-NEXT:    pushq %r14
   2826 ; AVX1-NEXT:  .Ltmp19:
   2827 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   2828 ; AVX1-NEXT:    pushq %rbx
   2829 ; AVX1-NEXT:  .Ltmp20:
   2830 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   2831 ; AVX1-NEXT:    subq $64, %rsp
   2832 ; AVX1-NEXT:  .Ltmp21:
   2833 ; AVX1-NEXT:    .cfi_def_cfa_offset 96
   2834 ; AVX1-NEXT:  .Ltmp22:
   2835 ; AVX1-NEXT:    .cfi_offset %rbx, -32
   2836 ; AVX1-NEXT:  .Ltmp23:
   2837 ; AVX1-NEXT:    .cfi_offset %r14, -24
   2838 ; AVX1-NEXT:  .Ltmp24:
   2839 ; AVX1-NEXT:    .cfi_offset %r15, -16
   2840 ; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
   2841 ; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   2842 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2843 ; AVX1-NEXT:    vzeroupper
   2844 ; AVX1-NEXT:    callq __truncdfhf2
   2845 ; AVX1-NEXT:    movw %ax, %bx
   2846 ; AVX1-NEXT:    shll $16, %ebx
   2847 ; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   2848 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2849 ; AVX1-NEXT:    vzeroupper
   2850 ; AVX1-NEXT:    callq __truncdfhf2
   2851 ; AVX1-NEXT:    movzwl %ax, %r15d
   2852 ; AVX1-NEXT:    orl %ebx, %r15d
   2853 ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   2854 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2855 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   2856 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2857 ; AVX1-NEXT:    vzeroupper
   2858 ; AVX1-NEXT:    callq __truncdfhf2
   2859 ; AVX1-NEXT:    movw %ax, %bx
   2860 ; AVX1-NEXT:    shll $16, %ebx
   2861 ; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   2862 ; AVX1-NEXT:    callq __truncdfhf2
   2863 ; AVX1-NEXT:    movzwl %ax, %r14d
   2864 ; AVX1-NEXT:    orl %ebx, %r14d
   2865 ; AVX1-NEXT:    shlq $32, %r14
   2866 ; AVX1-NEXT:    orq %r15, %r14
   2867 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2868 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2869 ; AVX1-NEXT:    vzeroupper
   2870 ; AVX1-NEXT:    callq __truncdfhf2
   2871 ; AVX1-NEXT:    movw %ax, %bx
   2872 ; AVX1-NEXT:    shll $16, %ebx
   2873 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2874 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2875 ; AVX1-NEXT:    vzeroupper
   2876 ; AVX1-NEXT:    callq __truncdfhf2
   2877 ; AVX1-NEXT:    movzwl %ax, %r15d
   2878 ; AVX1-NEXT:    orl %ebx, %r15d
   2879 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2880 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2881 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2882 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2883 ; AVX1-NEXT:    vzeroupper
   2884 ; AVX1-NEXT:    callq __truncdfhf2
   2885 ; AVX1-NEXT:    movw %ax, %bx
   2886 ; AVX1-NEXT:    shll $16, %ebx
   2887 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2888 ; AVX1-NEXT:    callq __truncdfhf2
   2889 ; AVX1-NEXT:    movzwl %ax, %eax
   2890 ; AVX1-NEXT:    orl %ebx, %eax
   2891 ; AVX1-NEXT:    shlq $32, %rax
   2892 ; AVX1-NEXT:    orq %r15, %rax
   2893 ; AVX1-NEXT:    vmovq %rax, %xmm0
   2894 ; AVX1-NEXT:    vmovq %r14, %xmm1
   2895 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2896 ; AVX1-NEXT:    addq $64, %rsp
   2897 ; AVX1-NEXT:    popq %rbx
   2898 ; AVX1-NEXT:    popq %r14
   2899 ; AVX1-NEXT:    popq %r15
   2900 ; AVX1-NEXT:    retq
   2901 ;
   2902 ; AVX2-LABEL: cvt_8f64_to_8i16:
   2903 ; AVX2:       # BB#0:
   2904 ; AVX2-NEXT:    pushq %r15
   2905 ; AVX2-NEXT:  .Ltmp18:
   2906 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   2907 ; AVX2-NEXT:    pushq %r14
   2908 ; AVX2-NEXT:  .Ltmp19:
   2909 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   2910 ; AVX2-NEXT:    pushq %rbx
   2911 ; AVX2-NEXT:  .Ltmp20:
   2912 ; AVX2-NEXT:    .cfi_def_cfa_offset 32
   2913 ; AVX2-NEXT:    subq $64, %rsp
   2914 ; AVX2-NEXT:  .Ltmp21:
   2915 ; AVX2-NEXT:    .cfi_def_cfa_offset 96
   2916 ; AVX2-NEXT:  .Ltmp22:
   2917 ; AVX2-NEXT:    .cfi_offset %rbx, -32
   2918 ; AVX2-NEXT:  .Ltmp23:
   2919 ; AVX2-NEXT:    .cfi_offset %r14, -24
   2920 ; AVX2-NEXT:  .Ltmp24:
   2921 ; AVX2-NEXT:    .cfi_offset %r15, -16
   2922 ; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
   2923 ; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   2924 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2925 ; AVX2-NEXT:    vzeroupper
   2926 ; AVX2-NEXT:    callq __truncdfhf2
   2927 ; AVX2-NEXT:    movw %ax, %bx
   2928 ; AVX2-NEXT:    shll $16, %ebx
   2929 ; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   2930 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2931 ; AVX2-NEXT:    vzeroupper
   2932 ; AVX2-NEXT:    callq __truncdfhf2
   2933 ; AVX2-NEXT:    movzwl %ax, %r15d
   2934 ; AVX2-NEXT:    orl %ebx, %r15d
   2935 ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   2936 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2937 ; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   2938 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2939 ; AVX2-NEXT:    vzeroupper
   2940 ; AVX2-NEXT:    callq __truncdfhf2
   2941 ; AVX2-NEXT:    movw %ax, %bx
   2942 ; AVX2-NEXT:    shll $16, %ebx
   2943 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   2944 ; AVX2-NEXT:    callq __truncdfhf2
   2945 ; AVX2-NEXT:    movzwl %ax, %r14d
   2946 ; AVX2-NEXT:    orl %ebx, %r14d
   2947 ; AVX2-NEXT:    shlq $32, %r14
   2948 ; AVX2-NEXT:    orq %r15, %r14
   2949 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2950 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2951 ; AVX2-NEXT:    vzeroupper
   2952 ; AVX2-NEXT:    callq __truncdfhf2
   2953 ; AVX2-NEXT:    movw %ax, %bx
   2954 ; AVX2-NEXT:    shll $16, %ebx
   2955 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   2956 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   2957 ; AVX2-NEXT:    vzeroupper
   2958 ; AVX2-NEXT:    callq __truncdfhf2
   2959 ; AVX2-NEXT:    movzwl %ax, %r15d
   2960 ; AVX2-NEXT:    orl %ebx, %r15d
   2961 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   2962 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2963 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   2964 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   2965 ; AVX2-NEXT:    vzeroupper
   2966 ; AVX2-NEXT:    callq __truncdfhf2
   2967 ; AVX2-NEXT:    movw %ax, %bx
   2968 ; AVX2-NEXT:    shll $16, %ebx
   2969 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   2970 ; AVX2-NEXT:    callq __truncdfhf2
   2971 ; AVX2-NEXT:    movzwl %ax, %eax
   2972 ; AVX2-NEXT:    orl %ebx, %eax
   2973 ; AVX2-NEXT:    shlq $32, %rax
   2974 ; AVX2-NEXT:    orq %r15, %rax
   2975 ; AVX2-NEXT:    vmovq %rax, %xmm0
   2976 ; AVX2-NEXT:    vmovq %r14, %xmm1
   2977 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2978 ; AVX2-NEXT:    addq $64, %rsp
   2979 ; AVX2-NEXT:    popq %rbx
   2980 ; AVX2-NEXT:    popq %r14
   2981 ; AVX2-NEXT:    popq %r15
   2982 ; AVX2-NEXT:    retq
   2983 ;
   2984 ; AVX512-LABEL: cvt_8f64_to_8i16:
   2985 ; AVX512:       # BB#0:
   2986 ; AVX512-NEXT:    pushq %r15
   2987 ; AVX512-NEXT:  .Ltmp18:
   2988 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   2989 ; AVX512-NEXT:    pushq %r14
   2990 ; AVX512-NEXT:  .Ltmp19:
   2991 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   2992 ; AVX512-NEXT:    pushq %rbx
   2993 ; AVX512-NEXT:  .Ltmp20:
   2994 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
   2995 ; AVX512-NEXT:    subq $96, %rsp
   2996 ; AVX512-NEXT:  .Ltmp21:
   2997 ; AVX512-NEXT:    .cfi_def_cfa_offset 128
   2998 ; AVX512-NEXT:  .Ltmp22:
   2999 ; AVX512-NEXT:    .cfi_offset %rbx, -32
   3000 ; AVX512-NEXT:  .Ltmp23:
   3001 ; AVX512-NEXT:    .cfi_offset %r14, -24
   3002 ; AVX512-NEXT:  .Ltmp24:
   3003 ; AVX512-NEXT:    .cfi_offset %r15, -16
   3004 ; AVX512-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
   3005 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3006 ; AVX512-NEXT:    callq __truncdfhf2
   3007 ; AVX512-NEXT:    movw %ax, %bx
   3008 ; AVX512-NEXT:    shll $16, %ebx
   3009 ; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
   3010 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
   3011 ; AVX512-NEXT:    callq __truncdfhf2
   3012 ; AVX512-NEXT:    movzwl %ax, %r15d
   3013 ; AVX512-NEXT:    orl %ebx, %r15d
   3014 ; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
   3015 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3016 ; AVX512-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3017 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3018 ; AVX512-NEXT:    callq __truncdfhf2
   3019 ; AVX512-NEXT:    movw %ax, %bx
   3020 ; AVX512-NEXT:    shll $16, %ebx
   3021 ; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3022 ; AVX512-NEXT:    callq __truncdfhf2
   3023 ; AVX512-NEXT:    movzwl %ax, %r14d
   3024 ; AVX512-NEXT:    orl %ebx, %r14d
   3025 ; AVX512-NEXT:    shlq $32, %r14
   3026 ; AVX512-NEXT:    orq %r15, %r14
   3027 ; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
   3028 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
   3029 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3030 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3031 ; AVX512-NEXT:    callq __truncdfhf2
   3032 ; AVX512-NEXT:    movw %ax, %bx
   3033 ; AVX512-NEXT:    shll $16, %ebx
   3034 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3035 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3036 ; AVX512-NEXT:    callq __truncdfhf2
   3037 ; AVX512-NEXT:    movzwl %ax, %r15d
   3038 ; AVX512-NEXT:    orl %ebx, %r15d
   3039 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3040 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3041 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3042 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3043 ; AVX512-NEXT:    callq __truncdfhf2
   3044 ; AVX512-NEXT:    movw %ax, %bx
   3045 ; AVX512-NEXT:    shll $16, %ebx
   3046 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3047 ; AVX512-NEXT:    callq __truncdfhf2
   3048 ; AVX512-NEXT:    movzwl %ax, %eax
   3049 ; AVX512-NEXT:    orl %ebx, %eax
   3050 ; AVX512-NEXT:    shlq $32, %rax
   3051 ; AVX512-NEXT:    orq %r15, %rax
   3052 ; AVX512-NEXT:    vmovq %rax, %xmm0
   3053 ; AVX512-NEXT:    vmovq %r14, %xmm1
   3054 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   3055 ; AVX512-NEXT:    addq $96, %rsp
   3056 ; AVX512-NEXT:    popq %rbx
   3057 ; AVX512-NEXT:    popq %r14
   3058 ; AVX512-NEXT:    popq %r15
   3059 ; AVX512-NEXT:    retq
   3060   %1 = fptrunc <8 x double> %a0 to <8 x half>
   3061   %2 = bitcast <8 x half> %1 to <8 x i16>
   3062   ret <8 x i16> %2
   3063 }
   3064 
   3065 ;
   3066 ; Double to Half (Store)
   3067 ;
   3068 
   3069 define void @store_cvt_f64_to_i16(double %a0, i16* %a1) {
   3070 ; ALL-LABEL: store_cvt_f64_to_i16:
   3071 ; ALL:       # BB#0:
   3072 ; ALL-NEXT:    pushq %rbx
   3073 ; ALL-NEXT:  .Ltmp25:
   3074 ; ALL-NEXT:    .cfi_def_cfa_offset 16
   3075 ; ALL-NEXT:  .Ltmp26:
   3076 ; ALL-NEXT:    .cfi_offset %rbx, -16
   3077 ; ALL-NEXT:    movq %rdi, %rbx
   3078 ; ALL-NEXT:    callq __truncdfhf2
   3079 ; ALL-NEXT:    movw %ax, (%rbx)
   3080 ; ALL-NEXT:    popq %rbx
   3081 ; ALL-NEXT:    retq
   3082   %1 = fptrunc double %a0 to half
   3083   %2 = bitcast half %1 to i16
   3084   store i16 %2, i16* %a1
   3085   ret void
   3086 }
   3087 
   3088 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) {
   3089 ; ALL-LABEL: store_cvt_2f64_to_2i16:
   3090 ; ALL:       # BB#0:
   3091 ; ALL-NEXT:    pushq %rbp
   3092 ; ALL-NEXT:  .Ltmp27:
   3093 ; ALL-NEXT:    .cfi_def_cfa_offset 16
   3094 ; ALL-NEXT:    pushq %rbx
   3095 ; ALL-NEXT:  .Ltmp28:
   3096 ; ALL-NEXT:    .cfi_def_cfa_offset 24
   3097 ; ALL-NEXT:    subq $24, %rsp
   3098 ; ALL-NEXT:  .Ltmp29:
   3099 ; ALL-NEXT:    .cfi_def_cfa_offset 48
   3100 ; ALL-NEXT:  .Ltmp30:
   3101 ; ALL-NEXT:    .cfi_offset %rbx, -24
   3102 ; ALL-NEXT:  .Ltmp31:
   3103 ; ALL-NEXT:    .cfi_offset %rbp, -16
   3104 ; ALL-NEXT:    movq %rdi, %rbx
   3105 ; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3106 ; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3107 ; ALL-NEXT:    callq __truncdfhf2
   3108 ; ALL-NEXT:    movl %eax, %ebp
   3109 ; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3110 ; ALL-NEXT:    callq __truncdfhf2
   3111 ; ALL-NEXT:    movw %ax, (%rbx)
   3112 ; ALL-NEXT:    movw %bp, 2(%rbx)
   3113 ; ALL-NEXT:    addq $24, %rsp
   3114 ; ALL-NEXT:    popq %rbx
   3115 ; ALL-NEXT:    popq %rbp
   3116 ; ALL-NEXT:    retq
   3117   %1 = fptrunc <2 x double> %a0 to <2 x half>
   3118   %2 = bitcast <2 x half> %1 to <2 x i16>
   3119   store <2 x i16> %2, <2 x i16>* %a1
   3120   ret void
   3121 }
   3122 
   3123 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) {
   3124 ; AVX1-LABEL: store_cvt_4f64_to_4i16:
   3125 ; AVX1:       # BB#0:
   3126 ; AVX1-NEXT:    pushq %rbp
   3127 ; AVX1-NEXT:  .Ltmp32:
   3128 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   3129 ; AVX1-NEXT:    pushq %r15
   3130 ; AVX1-NEXT:  .Ltmp33:
   3131 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   3132 ; AVX1-NEXT:    pushq %r14
   3133 ; AVX1-NEXT:  .Ltmp34:
   3134 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   3135 ; AVX1-NEXT:    pushq %rbx
   3136 ; AVX1-NEXT:  .Ltmp35:
   3137 ; AVX1-NEXT:    .cfi_def_cfa_offset 40
   3138 ; AVX1-NEXT:    subq $88, %rsp
   3139 ; AVX1-NEXT:  .Ltmp36:
   3140 ; AVX1-NEXT:    .cfi_def_cfa_offset 128
   3141 ; AVX1-NEXT:  .Ltmp37:
   3142 ; AVX1-NEXT:    .cfi_offset %rbx, -40
   3143 ; AVX1-NEXT:  .Ltmp38:
   3144 ; AVX1-NEXT:    .cfi_offset %r14, -32
   3145 ; AVX1-NEXT:  .Ltmp39:
   3146 ; AVX1-NEXT:    .cfi_offset %r15, -24
   3147 ; AVX1-NEXT:  .Ltmp40:
   3148 ; AVX1-NEXT:    .cfi_offset %rbp, -16
   3149 ; AVX1-NEXT:    movq %rdi, %rbx
   3150 ; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3151 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3152 ; AVX1-NEXT:    vzeroupper
   3153 ; AVX1-NEXT:    callq __truncdfhf2
   3154 ; AVX1-NEXT:    movl %eax, %r14d
   3155 ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3156 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3157 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3158 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3159 ; AVX1-NEXT:    vzeroupper
   3160 ; AVX1-NEXT:    callq __truncdfhf2
   3161 ; AVX1-NEXT:    movl %eax, %r15d
   3162 ; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3163 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3164 ; AVX1-NEXT:    vzeroupper
   3165 ; AVX1-NEXT:    callq __truncdfhf2
   3166 ; AVX1-NEXT:    movl %eax, %ebp
   3167 ; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3168 ; AVX1-NEXT:    callq __truncdfhf2
   3169 ; AVX1-NEXT:    movw %ax, 4(%rbx)
   3170 ; AVX1-NEXT:    movw %bp, (%rbx)
   3171 ; AVX1-NEXT:    movw %r15w, 6(%rbx)
   3172 ; AVX1-NEXT:    movw %r14w, 2(%rbx)
   3173 ; AVX1-NEXT:    addq $88, %rsp
   3174 ; AVX1-NEXT:    popq %rbx
   3175 ; AVX1-NEXT:    popq %r14
   3176 ; AVX1-NEXT:    popq %r15
   3177 ; AVX1-NEXT:    popq %rbp
   3178 ; AVX1-NEXT:    retq
   3179 ;
   3180 ; AVX2-LABEL: store_cvt_4f64_to_4i16:
   3181 ; AVX2:       # BB#0:
   3182 ; AVX2-NEXT:    pushq %rbp
   3183 ; AVX2-NEXT:  .Ltmp32:
   3184 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   3185 ; AVX2-NEXT:    pushq %r15
   3186 ; AVX2-NEXT:  .Ltmp33:
   3187 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   3188 ; AVX2-NEXT:    pushq %r14
   3189 ; AVX2-NEXT:  .Ltmp34:
   3190 ; AVX2-NEXT:    .cfi_def_cfa_offset 32
   3191 ; AVX2-NEXT:    pushq %rbx
   3192 ; AVX2-NEXT:  .Ltmp35:
   3193 ; AVX2-NEXT:    .cfi_def_cfa_offset 40
   3194 ; AVX2-NEXT:    subq $88, %rsp
   3195 ; AVX2-NEXT:  .Ltmp36:
   3196 ; AVX2-NEXT:    .cfi_def_cfa_offset 128
   3197 ; AVX2-NEXT:  .Ltmp37:
   3198 ; AVX2-NEXT:    .cfi_offset %rbx, -40
   3199 ; AVX2-NEXT:  .Ltmp38:
   3200 ; AVX2-NEXT:    .cfi_offset %r14, -32
   3201 ; AVX2-NEXT:  .Ltmp39:
   3202 ; AVX2-NEXT:    .cfi_offset %r15, -24
   3203 ; AVX2-NEXT:  .Ltmp40:
   3204 ; AVX2-NEXT:    .cfi_offset %rbp, -16
   3205 ; AVX2-NEXT:    movq %rdi, %rbx
   3206 ; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3207 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3208 ; AVX2-NEXT:    vzeroupper
   3209 ; AVX2-NEXT:    callq __truncdfhf2
   3210 ; AVX2-NEXT:    movl %eax, %r14d
   3211 ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3212 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3213 ; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3214 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3215 ; AVX2-NEXT:    vzeroupper
   3216 ; AVX2-NEXT:    callq __truncdfhf2
   3217 ; AVX2-NEXT:    movl %eax, %r15d
   3218 ; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3219 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3220 ; AVX2-NEXT:    vzeroupper
   3221 ; AVX2-NEXT:    callq __truncdfhf2
   3222 ; AVX2-NEXT:    movl %eax, %ebp
   3223 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3224 ; AVX2-NEXT:    callq __truncdfhf2
   3225 ; AVX2-NEXT:    movw %ax, 4(%rbx)
   3226 ; AVX2-NEXT:    movw %bp, (%rbx)
   3227 ; AVX2-NEXT:    movw %r15w, 6(%rbx)
   3228 ; AVX2-NEXT:    movw %r14w, 2(%rbx)
   3229 ; AVX2-NEXT:    addq $88, %rsp
   3230 ; AVX2-NEXT:    popq %rbx
   3231 ; AVX2-NEXT:    popq %r14
   3232 ; AVX2-NEXT:    popq %r15
   3233 ; AVX2-NEXT:    popq %rbp
   3234 ; AVX2-NEXT:    retq
   3235 ;
   3236 ; AVX512-LABEL: store_cvt_4f64_to_4i16:
   3237 ; AVX512:       # BB#0:
   3238 ; AVX512-NEXT:    pushq %rbp
   3239 ; AVX512-NEXT:  .Ltmp32:
   3240 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   3241 ; AVX512-NEXT:    pushq %r15
   3242 ; AVX512-NEXT:  .Ltmp33:
   3243 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   3244 ; AVX512-NEXT:    pushq %r14
   3245 ; AVX512-NEXT:  .Ltmp34:
   3246 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
   3247 ; AVX512-NEXT:    pushq %rbx
   3248 ; AVX512-NEXT:  .Ltmp35:
   3249 ; AVX512-NEXT:    .cfi_def_cfa_offset 40
   3250 ; AVX512-NEXT:    subq $88, %rsp
   3251 ; AVX512-NEXT:  .Ltmp36:
   3252 ; AVX512-NEXT:    .cfi_def_cfa_offset 128
   3253 ; AVX512-NEXT:  .Ltmp37:
   3254 ; AVX512-NEXT:    .cfi_offset %rbx, -40
   3255 ; AVX512-NEXT:  .Ltmp38:
   3256 ; AVX512-NEXT:    .cfi_offset %r14, -32
   3257 ; AVX512-NEXT:  .Ltmp39:
   3258 ; AVX512-NEXT:    .cfi_offset %r15, -24
   3259 ; AVX512-NEXT:  .Ltmp40:
   3260 ; AVX512-NEXT:    .cfi_offset %rbp, -16
   3261 ; AVX512-NEXT:    movq %rdi, %rbx
   3262 ; AVX512-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3263 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3264 ; AVX512-NEXT:    callq __truncdfhf2
   3265 ; AVX512-NEXT:    movl %eax, %r14d
   3266 ; AVX512-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3267 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3268 ; AVX512-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3269 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3270 ; AVX512-NEXT:    callq __truncdfhf2
   3271 ; AVX512-NEXT:    movl %eax, %r15d
   3272 ; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3273 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3274 ; AVX512-NEXT:    callq __truncdfhf2
   3275 ; AVX512-NEXT:    movl %eax, %ebp
   3276 ; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3277 ; AVX512-NEXT:    callq __truncdfhf2
   3278 ; AVX512-NEXT:    movw %ax, 4(%rbx)
   3279 ; AVX512-NEXT:    movw %bp, (%rbx)
   3280 ; AVX512-NEXT:    movw %r15w, 6(%rbx)
   3281 ; AVX512-NEXT:    movw %r14w, 2(%rbx)
   3282 ; AVX512-NEXT:    addq $88, %rsp
   3283 ; AVX512-NEXT:    popq %rbx
   3284 ; AVX512-NEXT:    popq %r14
   3285 ; AVX512-NEXT:    popq %r15
   3286 ; AVX512-NEXT:    popq %rbp
   3287 ; AVX512-NEXT:    retq
   3288   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3289   %2 = bitcast <4 x half> %1 to <4 x i16>
   3290   store <4 x i16> %2, <4 x i16>* %a1
   3291   ret void
   3292 }
   3293 
   3294 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) {
   3295 ; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
   3296 ; AVX1:       # BB#0:
   3297 ; AVX1-NEXT:    pushq %rbp
   3298 ; AVX1-NEXT:  .Ltmp41:
   3299 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   3300 ; AVX1-NEXT:    pushq %r14
   3301 ; AVX1-NEXT:  .Ltmp42:
   3302 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   3303 ; AVX1-NEXT:    pushq %rbx
   3304 ; AVX1-NEXT:  .Ltmp43:
   3305 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   3306 ; AVX1-NEXT:    subq $32, %rsp
   3307 ; AVX1-NEXT:  .Ltmp44:
   3308 ; AVX1-NEXT:    .cfi_def_cfa_offset 64
   3309 ; AVX1-NEXT:  .Ltmp45:
   3310 ; AVX1-NEXT:    .cfi_offset %rbx, -32
   3311 ; AVX1-NEXT:  .Ltmp46:
   3312 ; AVX1-NEXT:    .cfi_offset %r14, -24
   3313 ; AVX1-NEXT:  .Ltmp47:
   3314 ; AVX1-NEXT:    .cfi_offset %rbp, -16
   3315 ; AVX1-NEXT:    movq %rdi, %r14
   3316 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3317 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3318 ; AVX1-NEXT:    vzeroupper
   3319 ; AVX1-NEXT:    callq __truncdfhf2
   3320 ; AVX1-NEXT:    movw %ax, %bp
   3321 ; AVX1-NEXT:    shll $16, %ebp
   3322 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3323 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3324 ; AVX1-NEXT:    vzeroupper
   3325 ; AVX1-NEXT:    callq __truncdfhf2
   3326 ; AVX1-NEXT:    movzwl %ax, %ebx
   3327 ; AVX1-NEXT:    orl %ebp, %ebx
   3328 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3329 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3330 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3331 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3332 ; AVX1-NEXT:    vzeroupper
   3333 ; AVX1-NEXT:    callq __truncdfhf2
   3334 ; AVX1-NEXT:    movw %ax, %bp
   3335 ; AVX1-NEXT:    shll $16, %ebp
   3336 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3337 ; AVX1-NEXT:    callq __truncdfhf2
   3338 ; AVX1-NEXT:    movzwl %ax, %eax
   3339 ; AVX1-NEXT:    orl %ebp, %eax
   3340 ; AVX1-NEXT:    shlq $32, %rax
   3341 ; AVX1-NEXT:    orq %rbx, %rax
   3342 ; AVX1-NEXT:    vmovq %rax, %xmm0
   3343 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   3344 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
   3345 ; AVX1-NEXT:    addq $32, %rsp
   3346 ; AVX1-NEXT:    popq %rbx
   3347 ; AVX1-NEXT:    popq %r14
   3348 ; AVX1-NEXT:    popq %rbp
   3349 ; AVX1-NEXT:    retq
   3350 ;
   3351 ; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
   3352 ; AVX2:       # BB#0:
   3353 ; AVX2-NEXT:    pushq %rbp
   3354 ; AVX2-NEXT:  .Ltmp41:
   3355 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   3356 ; AVX2-NEXT:    pushq %r14
   3357 ; AVX2-NEXT:  .Ltmp42:
   3358 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   3359 ; AVX2-NEXT:    pushq %rbx
   3360 ; AVX2-NEXT:  .Ltmp43:
   3361 ; AVX2-NEXT:    .cfi_def_cfa_offset 32
   3362 ; AVX2-NEXT:    subq $32, %rsp
   3363 ; AVX2-NEXT:  .Ltmp44:
   3364 ; AVX2-NEXT:    .cfi_def_cfa_offset 64
   3365 ; AVX2-NEXT:  .Ltmp45:
   3366 ; AVX2-NEXT:    .cfi_offset %rbx, -32
   3367 ; AVX2-NEXT:  .Ltmp46:
   3368 ; AVX2-NEXT:    .cfi_offset %r14, -24
   3369 ; AVX2-NEXT:  .Ltmp47:
   3370 ; AVX2-NEXT:    .cfi_offset %rbp, -16
   3371 ; AVX2-NEXT:    movq %rdi, %r14
   3372 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3373 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3374 ; AVX2-NEXT:    vzeroupper
   3375 ; AVX2-NEXT:    callq __truncdfhf2
   3376 ; AVX2-NEXT:    movw %ax, %bp
   3377 ; AVX2-NEXT:    shll $16, %ebp
   3378 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3379 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3380 ; AVX2-NEXT:    vzeroupper
   3381 ; AVX2-NEXT:    callq __truncdfhf2
   3382 ; AVX2-NEXT:    movzwl %ax, %ebx
   3383 ; AVX2-NEXT:    orl %ebp, %ebx
   3384 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3385 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3386 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3387 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3388 ; AVX2-NEXT:    vzeroupper
   3389 ; AVX2-NEXT:    callq __truncdfhf2
   3390 ; AVX2-NEXT:    movw %ax, %bp
   3391 ; AVX2-NEXT:    shll $16, %ebp
   3392 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3393 ; AVX2-NEXT:    callq __truncdfhf2
   3394 ; AVX2-NEXT:    movzwl %ax, %eax
   3395 ; AVX2-NEXT:    orl %ebp, %eax
   3396 ; AVX2-NEXT:    shlq $32, %rax
   3397 ; AVX2-NEXT:    orq %rbx, %rax
   3398 ; AVX2-NEXT:    vmovq %rax, %xmm0
   3399 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   3400 ; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
   3401 ; AVX2-NEXT:    addq $32, %rsp
   3402 ; AVX2-NEXT:    popq %rbx
   3403 ; AVX2-NEXT:    popq %r14
   3404 ; AVX2-NEXT:    popq %rbp
   3405 ; AVX2-NEXT:    retq
   3406 ;
   3407 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
   3408 ; AVX512:       # BB#0:
   3409 ; AVX512-NEXT:    pushq %rbp
   3410 ; AVX512-NEXT:  .Ltmp41:
   3411 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   3412 ; AVX512-NEXT:    pushq %r14
   3413 ; AVX512-NEXT:  .Ltmp42:
   3414 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   3415 ; AVX512-NEXT:    pushq %rbx
   3416 ; AVX512-NEXT:  .Ltmp43:
   3417 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
   3418 ; AVX512-NEXT:    subq $32, %rsp
   3419 ; AVX512-NEXT:  .Ltmp44:
   3420 ; AVX512-NEXT:    .cfi_def_cfa_offset 64
   3421 ; AVX512-NEXT:  .Ltmp45:
   3422 ; AVX512-NEXT:    .cfi_offset %rbx, -32
   3423 ; AVX512-NEXT:  .Ltmp46:
   3424 ; AVX512-NEXT:    .cfi_offset %r14, -24
   3425 ; AVX512-NEXT:  .Ltmp47:
   3426 ; AVX512-NEXT:    .cfi_offset %rbp, -16
   3427 ; AVX512-NEXT:    movq %rdi, %r14
   3428 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3429 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3430 ; AVX512-NEXT:    callq __truncdfhf2
   3431 ; AVX512-NEXT:    movw %ax, %bp
   3432 ; AVX512-NEXT:    shll $16, %ebp
   3433 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3434 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3435 ; AVX512-NEXT:    callq __truncdfhf2
   3436 ; AVX512-NEXT:    movzwl %ax, %ebx
   3437 ; AVX512-NEXT:    orl %ebp, %ebx
   3438 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3439 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3440 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3441 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3442 ; AVX512-NEXT:    callq __truncdfhf2
   3443 ; AVX512-NEXT:    movw %ax, %bp
   3444 ; AVX512-NEXT:    shll $16, %ebp
   3445 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3446 ; AVX512-NEXT:    callq __truncdfhf2
   3447 ; AVX512-NEXT:    movzwl %ax, %eax
   3448 ; AVX512-NEXT:    orl %ebp, %eax
   3449 ; AVX512-NEXT:    shlq $32, %rax
   3450 ; AVX512-NEXT:    orq %rbx, %rax
   3451 ; AVX512-NEXT:    vmovq %rax, %xmm0
   3452 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   3453 ; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
   3454 ; AVX512-NEXT:    addq $32, %rsp
   3455 ; AVX512-NEXT:    popq %rbx
   3456 ; AVX512-NEXT:    popq %r14
   3457 ; AVX512-NEXT:    popq %rbp
   3458 ; AVX512-NEXT:    retq
   3459   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3460   %2 = bitcast <4 x half> %1 to <4 x i16>
   3461   %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   3462   store <8 x i16> %3, <8 x i16>* %a1
   3463   ret void
   3464 }
   3465 
   3466 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) {
   3467 ; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
   3468 ; AVX1:       # BB#0:
   3469 ; AVX1-NEXT:    pushq %rbp
   3470 ; AVX1-NEXT:  .Ltmp48:
   3471 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   3472 ; AVX1-NEXT:    pushq %r14
   3473 ; AVX1-NEXT:  .Ltmp49:
   3474 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   3475 ; AVX1-NEXT:    pushq %rbx
   3476 ; AVX1-NEXT:  .Ltmp50:
   3477 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   3478 ; AVX1-NEXT:    subq $32, %rsp
   3479 ; AVX1-NEXT:  .Ltmp51:
   3480 ; AVX1-NEXT:    .cfi_def_cfa_offset 64
   3481 ; AVX1-NEXT:  .Ltmp52:
   3482 ; AVX1-NEXT:    .cfi_offset %rbx, -32
   3483 ; AVX1-NEXT:  .Ltmp53:
   3484 ; AVX1-NEXT:    .cfi_offset %r14, -24
   3485 ; AVX1-NEXT:  .Ltmp54:
   3486 ; AVX1-NEXT:    .cfi_offset %rbp, -16
   3487 ; AVX1-NEXT:    movq %rdi, %r14
   3488 ; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3489 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3490 ; AVX1-NEXT:    vzeroupper
   3491 ; AVX1-NEXT:    callq __truncdfhf2
   3492 ; AVX1-NEXT:    movw %ax, %bp
   3493 ; AVX1-NEXT:    shll $16, %ebp
   3494 ; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3495 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3496 ; AVX1-NEXT:    vzeroupper
   3497 ; AVX1-NEXT:    callq __truncdfhf2
   3498 ; AVX1-NEXT:    movzwl %ax, %ebx
   3499 ; AVX1-NEXT:    orl %ebp, %ebx
   3500 ; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3501 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3502 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3503 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3504 ; AVX1-NEXT:    vzeroupper
   3505 ; AVX1-NEXT:    callq __truncdfhf2
   3506 ; AVX1-NEXT:    movw %ax, %bp
   3507 ; AVX1-NEXT:    shll $16, %ebp
   3508 ; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3509 ; AVX1-NEXT:    callq __truncdfhf2
   3510 ; AVX1-NEXT:    movzwl %ax, %eax
   3511 ; AVX1-NEXT:    orl %ebp, %eax
   3512 ; AVX1-NEXT:    shlq $32, %rax
   3513 ; AVX1-NEXT:    orq %rbx, %rax
   3514 ; AVX1-NEXT:    vmovq %rax, %xmm0
   3515 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   3516 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
   3517 ; AVX1-NEXT:    addq $32, %rsp
   3518 ; AVX1-NEXT:    popq %rbx
   3519 ; AVX1-NEXT:    popq %r14
   3520 ; AVX1-NEXT:    popq %rbp
   3521 ; AVX1-NEXT:    retq
   3522 ;
   3523 ; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
   3524 ; AVX2:       # BB#0:
   3525 ; AVX2-NEXT:    pushq %rbp
   3526 ; AVX2-NEXT:  .Ltmp48:
   3527 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   3528 ; AVX2-NEXT:    pushq %r14
   3529 ; AVX2-NEXT:  .Ltmp49:
   3530 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   3531 ; AVX2-NEXT:    pushq %rbx
   3532 ; AVX2-NEXT:  .Ltmp50:
   3533 ; AVX2-NEXT:    .cfi_def_cfa_offset 32
   3534 ; AVX2-NEXT:    subq $32, %rsp
   3535 ; AVX2-NEXT:  .Ltmp51:
   3536 ; AVX2-NEXT:    .cfi_def_cfa_offset 64
   3537 ; AVX2-NEXT:  .Ltmp52:
   3538 ; AVX2-NEXT:    .cfi_offset %rbx, -32
   3539 ; AVX2-NEXT:  .Ltmp53:
   3540 ; AVX2-NEXT:    .cfi_offset %r14, -24
   3541 ; AVX2-NEXT:  .Ltmp54:
   3542 ; AVX2-NEXT:    .cfi_offset %rbp, -16
   3543 ; AVX2-NEXT:    movq %rdi, %r14
   3544 ; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3545 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3546 ; AVX2-NEXT:    vzeroupper
   3547 ; AVX2-NEXT:    callq __truncdfhf2
   3548 ; AVX2-NEXT:    movw %ax, %bp
   3549 ; AVX2-NEXT:    shll $16, %ebp
   3550 ; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3551 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3552 ; AVX2-NEXT:    vzeroupper
   3553 ; AVX2-NEXT:    callq __truncdfhf2
   3554 ; AVX2-NEXT:    movzwl %ax, %ebx
   3555 ; AVX2-NEXT:    orl %ebp, %ebx
   3556 ; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3557 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3558 ; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3559 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3560 ; AVX2-NEXT:    vzeroupper
   3561 ; AVX2-NEXT:    callq __truncdfhf2
   3562 ; AVX2-NEXT:    movw %ax, %bp
   3563 ; AVX2-NEXT:    shll $16, %ebp
   3564 ; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3565 ; AVX2-NEXT:    callq __truncdfhf2
   3566 ; AVX2-NEXT:    movzwl %ax, %eax
   3567 ; AVX2-NEXT:    orl %ebp, %eax
   3568 ; AVX2-NEXT:    shlq $32, %rax
   3569 ; AVX2-NEXT:    orq %rbx, %rax
   3570 ; AVX2-NEXT:    vmovq %rax, %xmm0
   3571 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   3572 ; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
   3573 ; AVX2-NEXT:    addq $32, %rsp
   3574 ; AVX2-NEXT:    popq %rbx
   3575 ; AVX2-NEXT:    popq %r14
   3576 ; AVX2-NEXT:    popq %rbp
   3577 ; AVX2-NEXT:    retq
   3578 ;
   3579 ; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
   3580 ; AVX512:       # BB#0:
   3581 ; AVX512-NEXT:    pushq %rbp
   3582 ; AVX512-NEXT:  .Ltmp48:
   3583 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   3584 ; AVX512-NEXT:    pushq %r14
   3585 ; AVX512-NEXT:  .Ltmp49:
   3586 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   3587 ; AVX512-NEXT:    pushq %rbx
   3588 ; AVX512-NEXT:  .Ltmp50:
   3589 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
   3590 ; AVX512-NEXT:    subq $32, %rsp
   3591 ; AVX512-NEXT:  .Ltmp51:
   3592 ; AVX512-NEXT:    .cfi_def_cfa_offset 64
   3593 ; AVX512-NEXT:  .Ltmp52:
   3594 ; AVX512-NEXT:    .cfi_offset %rbx, -32
   3595 ; AVX512-NEXT:  .Ltmp53:
   3596 ; AVX512-NEXT:    .cfi_offset %r14, -24
   3597 ; AVX512-NEXT:  .Ltmp54:
   3598 ; AVX512-NEXT:    .cfi_offset %rbp, -16
   3599 ; AVX512-NEXT:    movq %rdi, %r14
   3600 ; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
   3601 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3602 ; AVX512-NEXT:    callq __truncdfhf2
   3603 ; AVX512-NEXT:    movw %ax, %bp
   3604 ; AVX512-NEXT:    shll $16, %ebp
   3605 ; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
   3606 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3607 ; AVX512-NEXT:    callq __truncdfhf2
   3608 ; AVX512-NEXT:    movzwl %ax, %ebx
   3609 ; AVX512-NEXT:    orl %ebp, %ebx
   3610 ; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
   3611 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3612 ; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
   3613 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3614 ; AVX512-NEXT:    callq __truncdfhf2
   3615 ; AVX512-NEXT:    movw %ax, %bp
   3616 ; AVX512-NEXT:    shll $16, %ebp
   3617 ; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
   3618 ; AVX512-NEXT:    callq __truncdfhf2
   3619 ; AVX512-NEXT:    movzwl %ax, %eax
   3620 ; AVX512-NEXT:    orl %ebp, %eax
   3621 ; AVX512-NEXT:    shlq $32, %rax
   3622 ; AVX512-NEXT:    orq %rbx, %rax
   3623 ; AVX512-NEXT:    vmovq %rax, %xmm0
   3624 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
   3625 ; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
   3626 ; AVX512-NEXT:    addq $32, %rsp
   3627 ; AVX512-NEXT:    popq %rbx
   3628 ; AVX512-NEXT:    popq %r14
   3629 ; AVX512-NEXT:    popq %rbp
   3630 ; AVX512-NEXT:    retq
   3631   %1 = fptrunc <4 x double> %a0 to <4 x half>
   3632   %2 = bitcast <4 x half> %1 to <4 x i16>
   3633   %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   3634   store <8 x i16> %3, <8 x i16>* %a1
   3635   ret void
   3636 }
   3637 
   3638 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
   3639 ; AVX1-LABEL: store_cvt_8f64_to_8i16:
   3640 ; AVX1:       # BB#0:
   3641 ; AVX1-NEXT:    pushq %rbp
   3642 ; AVX1-NEXT:  .Ltmp55:
   3643 ; AVX1-NEXT:    .cfi_def_cfa_offset 16
   3644 ; AVX1-NEXT:    pushq %r15
   3645 ; AVX1-NEXT:  .Ltmp56:
   3646 ; AVX1-NEXT:    .cfi_def_cfa_offset 24
   3647 ; AVX1-NEXT:    pushq %r14
   3648 ; AVX1-NEXT:  .Ltmp57:
   3649 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   3650 ; AVX1-NEXT:    pushq %r13
   3651 ; AVX1-NEXT:  .Ltmp58:
   3652 ; AVX1-NEXT:    .cfi_def_cfa_offset 40
   3653 ; AVX1-NEXT:    pushq %r12
   3654 ; AVX1-NEXT:  .Ltmp59:
   3655 ; AVX1-NEXT:    .cfi_def_cfa_offset 48
   3656 ; AVX1-NEXT:    pushq %rbx
   3657 ; AVX1-NEXT:  .Ltmp60:
   3658 ; AVX1-NEXT:    .cfi_def_cfa_offset 56
   3659 ; AVX1-NEXT:    subq $136, %rsp
   3660 ; AVX1-NEXT:  .Ltmp61:
   3661 ; AVX1-NEXT:    .cfi_def_cfa_offset 192
   3662 ; AVX1-NEXT:  .Ltmp62:
   3663 ; AVX1-NEXT:    .cfi_offset %rbx, -56
   3664 ; AVX1-NEXT:  .Ltmp63:
   3665 ; AVX1-NEXT:    .cfi_offset %r12, -48
   3666 ; AVX1-NEXT:  .Ltmp64:
   3667 ; AVX1-NEXT:    .cfi_offset %r13, -40
   3668 ; AVX1-NEXT:  .Ltmp65:
   3669 ; AVX1-NEXT:    .cfi_offset %r14, -32
   3670 ; AVX1-NEXT:  .Ltmp66:
   3671 ; AVX1-NEXT:    .cfi_offset %r15, -24
   3672 ; AVX1-NEXT:  .Ltmp67:
   3673 ; AVX1-NEXT:    .cfi_offset %rbp, -16
   3674 ; AVX1-NEXT:    movq %rdi, %rbx
   3675 ; AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
   3676 ; AVX1-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3677 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3678 ; AVX1-NEXT:    vzeroupper
   3679 ; AVX1-NEXT:    callq __truncdfhf2
   3680 ; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3681 ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3682 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3683 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3684 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3685 ; AVX1-NEXT:    vzeroupper
   3686 ; AVX1-NEXT:    callq __truncdfhf2
   3687 ; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3688 ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3689 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3690 ; AVX1-NEXT:    vzeroupper
   3691 ; AVX1-NEXT:    callq __truncdfhf2
   3692 ; AVX1-NEXT:    movl %eax, %r12d
   3693 ; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3694 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3695 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3696 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3697 ; AVX1-NEXT:    vzeroupper
   3698 ; AVX1-NEXT:    callq __truncdfhf2
   3699 ; AVX1-NEXT:    movl %eax, %r13d
   3700 ; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3701 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3702 ; AVX1-NEXT:    vzeroupper
   3703 ; AVX1-NEXT:    callq __truncdfhf2
   3704 ; AVX1-NEXT:    movl %eax, %ebp
   3705 ; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3706 ; AVX1-NEXT:    callq __truncdfhf2
   3707 ; AVX1-NEXT:    movl %eax, %r14d
   3708 ; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3709 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3710 ; AVX1-NEXT:    vzeroupper
   3711 ; AVX1-NEXT:    callq __truncdfhf2
   3712 ; AVX1-NEXT:    movl %eax, %r15d
   3713 ; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3714 ; AVX1-NEXT:    callq __truncdfhf2
   3715 ; AVX1-NEXT:    movw %ax, 12(%rbx)
   3716 ; AVX1-NEXT:    movw %r15w, 8(%rbx)
   3717 ; AVX1-NEXT:    movw %r14w, 4(%rbx)
   3718 ; AVX1-NEXT:    movw %bp, (%rbx)
   3719 ; AVX1-NEXT:    movw %r13w, 14(%rbx)
   3720 ; AVX1-NEXT:    movw %r12w, 10(%rbx)
   3721 ; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3722 ; AVX1-NEXT:    movw %ax, 6(%rbx)
   3723 ; AVX1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3724 ; AVX1-NEXT:    movw %ax, 2(%rbx)
   3725 ; AVX1-NEXT:    addq $136, %rsp
   3726 ; AVX1-NEXT:    popq %rbx
   3727 ; AVX1-NEXT:    popq %r12
   3728 ; AVX1-NEXT:    popq %r13
   3729 ; AVX1-NEXT:    popq %r14
   3730 ; AVX1-NEXT:    popq %r15
   3731 ; AVX1-NEXT:    popq %rbp
   3732 ; AVX1-NEXT:    retq
   3733 ;
   3734 ; AVX2-LABEL: store_cvt_8f64_to_8i16:
   3735 ; AVX2:       # BB#0:
   3736 ; AVX2-NEXT:    pushq %rbp
   3737 ; AVX2-NEXT:  .Ltmp55:
   3738 ; AVX2-NEXT:    .cfi_def_cfa_offset 16
   3739 ; AVX2-NEXT:    pushq %r15
   3740 ; AVX2-NEXT:  .Ltmp56:
   3741 ; AVX2-NEXT:    .cfi_def_cfa_offset 24
   3742 ; AVX2-NEXT:    pushq %r14
   3743 ; AVX2-NEXT:  .Ltmp57:
   3744 ; AVX2-NEXT:    .cfi_def_cfa_offset 32
   3745 ; AVX2-NEXT:    pushq %r13
   3746 ; AVX2-NEXT:  .Ltmp58:
   3747 ; AVX2-NEXT:    .cfi_def_cfa_offset 40
   3748 ; AVX2-NEXT:    pushq %r12
   3749 ; AVX2-NEXT:  .Ltmp59:
   3750 ; AVX2-NEXT:    .cfi_def_cfa_offset 48
   3751 ; AVX2-NEXT:    pushq %rbx
   3752 ; AVX2-NEXT:  .Ltmp60:
   3753 ; AVX2-NEXT:    .cfi_def_cfa_offset 56
   3754 ; AVX2-NEXT:    subq $136, %rsp
   3755 ; AVX2-NEXT:  .Ltmp61:
   3756 ; AVX2-NEXT:    .cfi_def_cfa_offset 192
   3757 ; AVX2-NEXT:  .Ltmp62:
   3758 ; AVX2-NEXT:    .cfi_offset %rbx, -56
   3759 ; AVX2-NEXT:  .Ltmp63:
   3760 ; AVX2-NEXT:    .cfi_offset %r12, -48
   3761 ; AVX2-NEXT:  .Ltmp64:
   3762 ; AVX2-NEXT:    .cfi_offset %r13, -40
   3763 ; AVX2-NEXT:  .Ltmp65:
   3764 ; AVX2-NEXT:    .cfi_offset %r14, -32
   3765 ; AVX2-NEXT:  .Ltmp66:
   3766 ; AVX2-NEXT:    .cfi_offset %r15, -24
   3767 ; AVX2-NEXT:  .Ltmp67:
   3768 ; AVX2-NEXT:    .cfi_offset %rbp, -16
   3769 ; AVX2-NEXT:    movq %rdi, %rbx
   3770 ; AVX2-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
   3771 ; AVX2-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3772 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3773 ; AVX2-NEXT:    vzeroupper
   3774 ; AVX2-NEXT:    callq __truncdfhf2
   3775 ; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3776 ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3777 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3778 ; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3779 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3780 ; AVX2-NEXT:    vzeroupper
   3781 ; AVX2-NEXT:    callq __truncdfhf2
   3782 ; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3783 ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3784 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3785 ; AVX2-NEXT:    vzeroupper
   3786 ; AVX2-NEXT:    callq __truncdfhf2
   3787 ; AVX2-NEXT:    movl %eax, %r12d
   3788 ; AVX2-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3789 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3790 ; AVX2-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3791 ; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3792 ; AVX2-NEXT:    vzeroupper
   3793 ; AVX2-NEXT:    callq __truncdfhf2
   3794 ; AVX2-NEXT:    movl %eax, %r13d
   3795 ; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3796 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3797 ; AVX2-NEXT:    vzeroupper
   3798 ; AVX2-NEXT:    callq __truncdfhf2
   3799 ; AVX2-NEXT:    movl %eax, %ebp
   3800 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3801 ; AVX2-NEXT:    callq __truncdfhf2
   3802 ; AVX2-NEXT:    movl %eax, %r14d
   3803 ; AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3804 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3805 ; AVX2-NEXT:    vzeroupper
   3806 ; AVX2-NEXT:    callq __truncdfhf2
   3807 ; AVX2-NEXT:    movl %eax, %r15d
   3808 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3809 ; AVX2-NEXT:    callq __truncdfhf2
   3810 ; AVX2-NEXT:    movw %ax, 12(%rbx)
   3811 ; AVX2-NEXT:    movw %r15w, 8(%rbx)
   3812 ; AVX2-NEXT:    movw %r14w, 4(%rbx)
   3813 ; AVX2-NEXT:    movw %bp, (%rbx)
   3814 ; AVX2-NEXT:    movw %r13w, 14(%rbx)
   3815 ; AVX2-NEXT:    movw %r12w, 10(%rbx)
   3816 ; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3817 ; AVX2-NEXT:    movw %ax, 6(%rbx)
   3818 ; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3819 ; AVX2-NEXT:    movw %ax, 2(%rbx)
   3820 ; AVX2-NEXT:    addq $136, %rsp
   3821 ; AVX2-NEXT:    popq %rbx
   3822 ; AVX2-NEXT:    popq %r12
   3823 ; AVX2-NEXT:    popq %r13
   3824 ; AVX2-NEXT:    popq %r14
   3825 ; AVX2-NEXT:    popq %r15
   3826 ; AVX2-NEXT:    popq %rbp
   3827 ; AVX2-NEXT:    retq
   3828 ;
   3829 ; AVX512-LABEL: store_cvt_8f64_to_8i16:
   3830 ; AVX512:       # BB#0:
   3831 ; AVX512-NEXT:    pushq %rbp
   3832 ; AVX512-NEXT:  .Ltmp55:
   3833 ; AVX512-NEXT:    .cfi_def_cfa_offset 16
   3834 ; AVX512-NEXT:    pushq %r15
   3835 ; AVX512-NEXT:  .Ltmp56:
   3836 ; AVX512-NEXT:    .cfi_def_cfa_offset 24
   3837 ; AVX512-NEXT:    pushq %r14
   3838 ; AVX512-NEXT:  .Ltmp57:
   3839 ; AVX512-NEXT:    .cfi_def_cfa_offset 32
   3840 ; AVX512-NEXT:    pushq %r13
   3841 ; AVX512-NEXT:  .Ltmp58:
   3842 ; AVX512-NEXT:    .cfi_def_cfa_offset 40
   3843 ; AVX512-NEXT:    pushq %r12
   3844 ; AVX512-NEXT:  .Ltmp59:
   3845 ; AVX512-NEXT:    .cfi_def_cfa_offset 48
   3846 ; AVX512-NEXT:    pushq %rbx
   3847 ; AVX512-NEXT:  .Ltmp60:
   3848 ; AVX512-NEXT:    .cfi_def_cfa_offset 56
   3849 ; AVX512-NEXT:    subq $200, %rsp
   3850 ; AVX512-NEXT:  .Ltmp61:
   3851 ; AVX512-NEXT:    .cfi_def_cfa_offset 256
   3852 ; AVX512-NEXT:  .Ltmp62:
   3853 ; AVX512-NEXT:    .cfi_offset %rbx, -56
   3854 ; AVX512-NEXT:  .Ltmp63:
   3855 ; AVX512-NEXT:    .cfi_offset %r12, -48
   3856 ; AVX512-NEXT:  .Ltmp64:
   3857 ; AVX512-NEXT:    .cfi_offset %r13, -40
   3858 ; AVX512-NEXT:  .Ltmp65:
   3859 ; AVX512-NEXT:    .cfi_offset %r14, -32
   3860 ; AVX512-NEXT:  .Ltmp66:
   3861 ; AVX512-NEXT:    .cfi_offset %r15, -24
   3862 ; AVX512-NEXT:  .Ltmp67:
   3863 ; AVX512-NEXT:    .cfi_offset %rbp, -16
   3864 ; AVX512-NEXT:    movq %rdi, %rbx
   3865 ; AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
   3866 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3867 ; AVX512-NEXT:    callq __truncdfhf2
   3868 ; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3869 ; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
   3870 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3871 ; AVX512-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3872 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3873 ; AVX512-NEXT:    callq __truncdfhf2
   3874 ; AVX512-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
   3875 ; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
   3876 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
   3877 ; AVX512-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
   3878 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3879 ; AVX512-NEXT:    callq __truncdfhf2
   3880 ; AVX512-NEXT:    movl %eax, %r12d
   3881 ; AVX512-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3882 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3883 ; AVX512-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
   3884 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   3885 ; AVX512-NEXT:    callq __truncdfhf2
   3886 ; AVX512-NEXT:    movl %eax, %r13d
   3887 ; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
   3888 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
   3889 ; AVX512-NEXT:    callq __truncdfhf2
   3890 ; AVX512-NEXT:    movl %eax, %ebp
   3891 ; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3892 ; AVX512-NEXT:    callq __truncdfhf2
   3893 ; AVX512-NEXT:    movl %eax, %r14d
   3894 ; AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
   3895 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
   3896 ; AVX512-NEXT:    callq __truncdfhf2
   3897 ; AVX512-NEXT:    movl %eax, %r15d
   3898 ; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   3899 ; AVX512-NEXT:    callq __truncdfhf2
   3900 ; AVX512-NEXT:    movw %ax, 12(%rbx)
   3901 ; AVX512-NEXT:    movw %r15w, 8(%rbx)
   3902 ; AVX512-NEXT:    movw %r14w, 4(%rbx)
   3903 ; AVX512-NEXT:    movw %bp, (%rbx)
   3904 ; AVX512-NEXT:    movw %r13w, 14(%rbx)
   3905 ; AVX512-NEXT:    movw %r12w, 10(%rbx)
   3906 ; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3907 ; AVX512-NEXT:    movw %ax, 6(%rbx)
   3908 ; AVX512-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
   3909 ; AVX512-NEXT:    movw %ax, 2(%rbx)
   3910 ; AVX512-NEXT:    addq $200, %rsp
   3911 ; AVX512-NEXT:    popq %rbx
   3912 ; AVX512-NEXT:    popq %r12
   3913 ; AVX512-NEXT:    popq %r13
   3914 ; AVX512-NEXT:    popq %r14
   3915 ; AVX512-NEXT:    popq %r15
   3916 ; AVX512-NEXT:    popq %rbp
   3917 ; AVX512-NEXT:    retq
   3918   %1 = fptrunc <8 x double> %a0 to <8 x half>
   3919   %2 = bitcast <8 x half> %1 to <8 x i16>
   3920   store <8 x i16> %2, <8 x i16>* %a1
   3921   ret void
   3922 }
   3923