; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL

; NOTE(review): This file is machine-generated FileCheck output; do not edit the
; CHECK lines by hand — rerun utils/update_llc_test_checks.py after any codegen
; change so the assertions stay in sync with llc's actual output.

;
; Half to Float
;

; Scalar half (bitcast from i16 in %di) extended to float: sign-extend into a
; GPR, move to an XMM register, then F16C vcvtph2ps.
define float @cvt_i16_to_f32(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

; <4 x half> -> <4 x float>: the four i16 lanes are extracted through a 64-bit
; GPR (shifts + movswl), each converted individually with vcvtph2ps, then
; reassembled with vinsertps. AVX512VL spills via vpmovdw to the stack instead
; of using vpshufb.
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_4i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

; Low 4 lanes of a <8 x i16> shuffled out, bitcast to half, extended to float.
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_8i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

; Full <8 x half> -> <8 x float>: both 64-bit halves extracted via GPRs, eight
; scalar conversions, two 128-bit lanes rebuilt and joined with vinsertf128.
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpextrq $1, %xmm0, %rdx
; ALL-NEXT:    movq %rdx, %r8
; ALL-NEXT:    movq %rdx, %r10
; ALL-NEXT:    movswl %dx, %r9d
; ALL-NEXT:    # kill: def $edx killed $edx killed $rdx
; ALL-NEXT:    shrl $16, %edx
; ALL-NEXT:    shrq $32, %r8
; ALL-NEXT:    shrq $48, %r10
; ALL-NEXT:    vmovq %xmm0, %rdi
; ALL-NEXT:    movq %rdi, %rax
; ALL-NEXT:    movq %rdi, %rsi
; ALL-NEXT:    movswl %di, %ecx
; ALL-NEXT:    # kill: def $edi killed $edi killed $rdi
; ALL-NEXT:    shrl $16, %edi
; ALL-NEXT:    shrq $32, %rax
; ALL-NEXT:    shrq $48, %rsi
; ALL-NEXT:    movswl %si, %esi
; ALL-NEXT:    vmovd %esi, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %ecx, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    movswl %r10w, %eax
; ALL-NEXT:    vmovd %eax, %xmm4
; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
; ALL-NEXT:    movswl %r8w, %eax
; ALL-NEXT:    vmovd %eax, %xmm5
; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
; ALL-NEXT:    movswl %dx, %eax
; ALL-NEXT:    vmovd %eax, %xmm6
; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
; ALL-NEXT:    vmovd %r9d, %xmm7
; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

; <16 x half> -> <16 x float>: sixteen scalar conversions. AVX1/AVX2 return two
; YMM registers; AVX512 builds a single ZMM via vinsertf64x4. AVX512VL uses the
; extended xmm16-xmm22 register file.
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vmovq %xmm4, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm8
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm9
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm10
; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm11
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm12
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm13
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm14
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm15
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm5
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm6
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm7
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vmovq %xmm4, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm8
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm9
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm10
; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm11
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm12
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm13
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm14
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm15
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm3
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm5
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm6
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm7
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm8
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm9
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm11
; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm12
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm13
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm14
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm15
; AVX512F-NEXT:    vmovq %xmm10, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm2
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm3
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm4
; AVX512F-NEXT:    vpextrq $1, %xmm10, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm10
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm5
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm6
; AVX512F-NEXT:    movl %eax, %ecx
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm7
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX512F-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX512F-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX512F-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX512F-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX512F-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX512F-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX512F-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm10
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm8
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm9
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm11
; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm12
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm13
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm14
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm15
; AVX512VL-NEXT:    vmovq %xmm10, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm16
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm17
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm18
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm19
; AVX512VL-NEXT:    vpextrq $1, %xmm10, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm10
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm20
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm21
; AVX512VL-NEXT:    movl %eax, %ecx
; AVX512VL-NEXT:    shrl $16, %ecx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm22
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX512VL-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX512VL-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX512VL-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX512VL-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX512VL-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX512VL-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX512VL-NEXT:    vcvtph2ps %xmm16, %xmm16
; AVX512VL-NEXT:    vcvtph2ps %xmm17, %xmm4
; AVX512VL-NEXT:    vcvtph2ps %xmm18, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm19, %xmm5
; AVX512VL-NEXT:    vcvtph2ps %xmm10, %xmm7
; AVX512VL-NEXT:    vcvtph2ps %xmm20, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm21, %xmm6
; AVX512VL-NEXT:    vcvtph2ps %xmm22, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

;
; Half to Float (Load)
;

; Scalar half loaded from memory: movswl folds the load with the sign-extend.
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = load i16, i16* %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

; <4 x half> loaded from memory: four sign-extending scalar loads at fixed
; offsets, four conversions, vinsertps reassembly — same on all subtargets.
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

; Low 4 lanes of a loaded <8 x i16>: the low 8 bytes go through a single 64-bit
; GPR load; AVX512VL instead round-trips via vpmovzxwd/vpmovdw on the stack.
define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq (%rdi), %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

; <8 x half> loaded from memory: eight sign-extending loads at offsets 0..14,
; converted and reassembled into a YMM — same on all subtargets.
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    movswl 14(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm4
; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
; ALL-NEXT:    movswl 12(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm5
; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
; ALL-NEXT:    movswl 8(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm6
; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
; ALL-NEXT:    movswl 10(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm7
; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

; NOTE(review): the next definition is truncated in this chunk of the file —
; the remainder of load_cvt_16i16_to_16f32 lies outside the visible range.
define <16 x float> @load_cvt_16i16_to_16f32(<16
x i16>* %a0) nounwind { 887 ; AVX1-LABEL: load_cvt_16i16_to_16f32: 888 ; AVX1: # %bb.0: 889 ; AVX1-NEXT: movswl 22(%rdi), %eax 890 ; AVX1-NEXT: vmovd %eax, %xmm0 891 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8 892 ; AVX1-NEXT: movswl 20(%rdi), %eax 893 ; AVX1-NEXT: vmovd %eax, %xmm0 894 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9 895 ; AVX1-NEXT: movswl 16(%rdi), %eax 896 ; AVX1-NEXT: vmovd %eax, %xmm0 897 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10 898 ; AVX1-NEXT: movswl 18(%rdi), %eax 899 ; AVX1-NEXT: vmovd %eax, %xmm0 900 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11 901 ; AVX1-NEXT: movswl 30(%rdi), %eax 902 ; AVX1-NEXT: vmovd %eax, %xmm0 903 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12 904 ; AVX1-NEXT: movswl 28(%rdi), %eax 905 ; AVX1-NEXT: vmovd %eax, %xmm0 906 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13 907 ; AVX1-NEXT: movswl 24(%rdi), %eax 908 ; AVX1-NEXT: vmovd %eax, %xmm0 909 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14 910 ; AVX1-NEXT: movswl 26(%rdi), %eax 911 ; AVX1-NEXT: vmovd %eax, %xmm0 912 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15 913 ; AVX1-NEXT: movswl 6(%rdi), %eax 914 ; AVX1-NEXT: vmovd %eax, %xmm0 915 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 916 ; AVX1-NEXT: movswl 4(%rdi), %eax 917 ; AVX1-NEXT: vmovd %eax, %xmm2 918 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 919 ; AVX1-NEXT: movswl (%rdi), %eax 920 ; AVX1-NEXT: vmovd %eax, %xmm3 921 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 922 ; AVX1-NEXT: movswl 2(%rdi), %eax 923 ; AVX1-NEXT: vmovd %eax, %xmm4 924 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 925 ; AVX1-NEXT: movswl 14(%rdi), %eax 926 ; AVX1-NEXT: vmovd %eax, %xmm5 927 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 928 ; AVX1-NEXT: movswl 12(%rdi), %eax 929 ; AVX1-NEXT: vmovd %eax, %xmm6 930 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 931 ; AVX1-NEXT: movswl 8(%rdi), %eax 932 ; AVX1-NEXT: vmovd %eax, %xmm7 933 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 934 ; AVX1-NEXT: movswl 10(%rdi), %eax 935 ; AVX1-NEXT: vmovd %eax, %xmm1 936 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 937 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 938 ; AVX1-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 939 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 940 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 941 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 942 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 943 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 944 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 945 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 946 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 947 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 948 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 949 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 950 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 951 ; AVX1-NEXT: retq 952 ; 953 ; AVX2-LABEL: load_cvt_16i16_to_16f32: 954 ; AVX2: # %bb.0: 955 ; AVX2-NEXT: movswl 22(%rdi), %eax 956 ; AVX2-NEXT: vmovd %eax, %xmm0 957 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8 958 ; AVX2-NEXT: movswl 20(%rdi), %eax 959 ; AVX2-NEXT: vmovd %eax, %xmm0 960 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9 961 ; AVX2-NEXT: movswl 16(%rdi), %eax 962 ; AVX2-NEXT: vmovd %eax, %xmm0 963 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10 964 ; AVX2-NEXT: movswl 18(%rdi), %eax 965 ; AVX2-NEXT: vmovd %eax, %xmm0 966 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11 967 ; AVX2-NEXT: movswl 30(%rdi), %eax 968 ; AVX2-NEXT: vmovd %eax, %xmm0 969 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12 970 ; AVX2-NEXT: movswl 28(%rdi), %eax 971 ; AVX2-NEXT: vmovd %eax, %xmm0 972 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13 973 ; AVX2-NEXT: movswl 24(%rdi), %eax 974 ; AVX2-NEXT: vmovd %eax, %xmm0 975 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14 976 ; AVX2-NEXT: movswl 26(%rdi), %eax 977 ; AVX2-NEXT: vmovd %eax, %xmm0 978 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15 979 ; AVX2-NEXT: movswl 6(%rdi), %eax 980 ; AVX2-NEXT: vmovd %eax, %xmm0 981 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 982 ; AVX2-NEXT: movswl 4(%rdi), %eax 983 ; 
AVX2-NEXT: vmovd %eax, %xmm2 984 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 985 ; AVX2-NEXT: movswl (%rdi), %eax 986 ; AVX2-NEXT: vmovd %eax, %xmm3 987 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 988 ; AVX2-NEXT: movswl 2(%rdi), %eax 989 ; AVX2-NEXT: vmovd %eax, %xmm4 990 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 991 ; AVX2-NEXT: movswl 14(%rdi), %eax 992 ; AVX2-NEXT: vmovd %eax, %xmm5 993 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 994 ; AVX2-NEXT: movswl 12(%rdi), %eax 995 ; AVX2-NEXT: vmovd %eax, %xmm6 996 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 997 ; AVX2-NEXT: movswl 8(%rdi), %eax 998 ; AVX2-NEXT: vmovd %eax, %xmm7 999 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1000 ; AVX2-NEXT: movswl 10(%rdi), %eax 1001 ; AVX2-NEXT: vmovd %eax, %xmm1 1002 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1003 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 1004 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 1005 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 1006 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 1007 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 1008 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 1009 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1010 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 1011 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 1012 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 1013 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 1014 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 1015 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 1016 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1017 ; AVX2-NEXT: retq 1018 ; 1019 ; AVX512F-LABEL: load_cvt_16i16_to_16f32: 1020 ; AVX512F: # %bb.0: 1021 ; AVX512F-NEXT: movswl 6(%rdi), %eax 1022 ; AVX512F-NEXT: vmovd %eax, %xmm0 1023 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8 1024 ; AVX512F-NEXT: movswl 4(%rdi), %eax 1025 ; AVX512F-NEXT: vmovd %eax, %xmm0 1026 ; 
AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9 1027 ; AVX512F-NEXT: movswl (%rdi), %eax 1028 ; AVX512F-NEXT: vmovd %eax, %xmm0 1029 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10 1030 ; AVX512F-NEXT: movswl 2(%rdi), %eax 1031 ; AVX512F-NEXT: vmovd %eax, %xmm0 1032 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11 1033 ; AVX512F-NEXT: movswl 14(%rdi), %eax 1034 ; AVX512F-NEXT: vmovd %eax, %xmm0 1035 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12 1036 ; AVX512F-NEXT: movswl 12(%rdi), %eax 1037 ; AVX512F-NEXT: vmovd %eax, %xmm0 1038 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13 1039 ; AVX512F-NEXT: movswl 8(%rdi), %eax 1040 ; AVX512F-NEXT: vmovd %eax, %xmm0 1041 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14 1042 ; AVX512F-NEXT: movswl 10(%rdi), %eax 1043 ; AVX512F-NEXT: vmovd %eax, %xmm0 1044 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15 1045 ; AVX512F-NEXT: movswl 22(%rdi), %eax 1046 ; AVX512F-NEXT: vmovd %eax, %xmm0 1047 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1048 ; AVX512F-NEXT: movswl 20(%rdi), %eax 1049 ; AVX512F-NEXT: vmovd %eax, %xmm1 1050 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1051 ; AVX512F-NEXT: movswl 16(%rdi), %eax 1052 ; AVX512F-NEXT: vmovd %eax, %xmm2 1053 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1054 ; AVX512F-NEXT: movswl 18(%rdi), %eax 1055 ; AVX512F-NEXT: vmovd %eax, %xmm3 1056 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1057 ; AVX512F-NEXT: movswl 30(%rdi), %eax 1058 ; AVX512F-NEXT: vmovd %eax, %xmm4 1059 ; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 1060 ; AVX512F-NEXT: movswl 28(%rdi), %eax 1061 ; AVX512F-NEXT: vmovd %eax, %xmm5 1062 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 1063 ; AVX512F-NEXT: movswl 24(%rdi), %eax 1064 ; AVX512F-NEXT: vmovd %eax, %xmm6 1065 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 1066 ; AVX512F-NEXT: movswl 26(%rdi), %eax 1067 ; AVX512F-NEXT: vmovd %eax, %xmm7 1068 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 1069 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 1070 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 1071 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = 
xmm5[0,1,2],xmm4[0] 1072 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 1073 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 1074 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1075 ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1076 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 1077 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 1078 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 1079 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 1080 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 1081 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 1082 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1083 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 1084 ; AVX512F-NEXT: retq 1085 ; 1086 ; AVX512VL-LABEL: load_cvt_16i16_to_16f32: 1087 ; AVX512VL: # %bb.0: 1088 ; AVX512VL-NEXT: movswl 6(%rdi), %eax 1089 ; AVX512VL-NEXT: vmovd %eax, %xmm0 1090 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8 1091 ; AVX512VL-NEXT: movswl 4(%rdi), %eax 1092 ; AVX512VL-NEXT: vmovd %eax, %xmm1 1093 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm9 1094 ; AVX512VL-NEXT: movswl (%rdi), %eax 1095 ; AVX512VL-NEXT: vmovd %eax, %xmm2 1096 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm10 1097 ; AVX512VL-NEXT: movswl 2(%rdi), %eax 1098 ; AVX512VL-NEXT: vmovd %eax, %xmm3 1099 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm11 1100 ; AVX512VL-NEXT: movswl 14(%rdi), %eax 1101 ; AVX512VL-NEXT: vmovd %eax, %xmm4 1102 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm12 1103 ; AVX512VL-NEXT: movswl 12(%rdi), %eax 1104 ; AVX512VL-NEXT: vmovd %eax, %xmm5 1105 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm13 1106 ; AVX512VL-NEXT: movswl 8(%rdi), %eax 1107 ; AVX512VL-NEXT: vmovd %eax, %xmm6 1108 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm14 1109 ; AVX512VL-NEXT: movswl 10(%rdi), %eax 1110 ; AVX512VL-NEXT: vmovd %eax, %xmm7 1111 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm15 1112 ; AVX512VL-NEXT: 
movswl 22(%rdi), %eax 1113 ; AVX512VL-NEXT: vmovd %eax, %xmm0 1114 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1115 ; AVX512VL-NEXT: movswl 20(%rdi), %eax 1116 ; AVX512VL-NEXT: vmovd %eax, %xmm1 1117 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1118 ; AVX512VL-NEXT: movswl 16(%rdi), %eax 1119 ; AVX512VL-NEXT: vmovd %eax, %xmm2 1120 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1121 ; AVX512VL-NEXT: movswl 18(%rdi), %eax 1122 ; AVX512VL-NEXT: vmovd %eax, %xmm3 1123 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1124 ; AVX512VL-NEXT: movswl 30(%rdi), %eax 1125 ; AVX512VL-NEXT: vmovd %eax, %xmm4 1126 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 1127 ; AVX512VL-NEXT: movswl 28(%rdi), %eax 1128 ; AVX512VL-NEXT: vmovd %eax, %xmm5 1129 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 1130 ; AVX512VL-NEXT: movswl 24(%rdi), %eax 1131 ; AVX512VL-NEXT: vmovd %eax, %xmm6 1132 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 1133 ; AVX512VL-NEXT: movswl 26(%rdi), %eax 1134 ; AVX512VL-NEXT: vmovd %eax, %xmm7 1135 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 1136 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 1137 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 1138 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 1139 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 1140 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 1141 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1142 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1143 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 1144 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 1145 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 1146 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 1147 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 1148 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 1149 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 
1150 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 1151 ; AVX512VL-NEXT: retq 1152 %1 = load <16 x i16>, <16 x i16>* %a0 1153 %2 = bitcast <16 x i16> %1 to <16 x half> 1154 %3 = fpext <16 x half> %2 to <16 x float> 1155 ret <16 x float> %3 1156 } 1157 1158 ; 1159 ; Half to Double 1160 ; 1161 1162 define double @cvt_i16_to_f64(i16 %a0) nounwind { 1163 ; ALL-LABEL: cvt_i16_to_f64: 1164 ; ALL: # %bb.0: 1165 ; ALL-NEXT: movswl %di, %eax 1166 ; ALL-NEXT: vmovd %eax, %xmm0 1167 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1168 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1169 ; ALL-NEXT: retq 1170 %1 = bitcast i16 %a0 to half 1171 %2 = fpext half %1 to double 1172 ret double %2 1173 } 1174 1175 define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { 1176 ; AVX1-LABEL: cvt_2i16_to_2f64: 1177 ; AVX1: # %bb.0: 1178 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1179 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1180 ; AVX1-NEXT: vmovd %xmm0, %eax 1181 ; AVX1-NEXT: movswl %ax, %ecx 1182 ; AVX1-NEXT: shrl $16, %eax 1183 ; AVX1-NEXT: cwtl 1184 ; AVX1-NEXT: vmovd %eax, %xmm0 1185 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1186 ; AVX1-NEXT: vmovd %ecx, %xmm1 1187 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1188 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1189 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1190 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1191 ; AVX1-NEXT: retq 1192 ; 1193 ; AVX2-SLOW-LABEL: cvt_2i16_to_2f64: 1194 ; AVX2-SLOW: # %bb.0: 1195 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1196 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1197 ; AVX2-SLOW-NEXT: vmovd %xmm0, %eax 1198 ; AVX2-SLOW-NEXT: movswl %ax, %ecx 1199 ; AVX2-SLOW-NEXT: shrl $16, %eax 1200 ; AVX2-SLOW-NEXT: cwtl 1201 ; AVX2-SLOW-NEXT: vmovd %eax, %xmm0 1202 ; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0 1203 ; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1 1204 ; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1 1205 ; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1206 ; AVX2-SLOW-NEXT: 
vcvtss2sd %xmm0, %xmm0, %xmm0 1207 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1208 ; AVX2-SLOW-NEXT: retq 1209 ; 1210 ; AVX2-FAST-LABEL: cvt_2i16_to_2f64: 1211 ; AVX2-FAST: # %bb.0: 1212 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] 1213 ; AVX2-FAST-NEXT: vmovd %xmm0, %eax 1214 ; AVX2-FAST-NEXT: movswl %ax, %ecx 1215 ; AVX2-FAST-NEXT: shrl $16, %eax 1216 ; AVX2-FAST-NEXT: cwtl 1217 ; AVX2-FAST-NEXT: vmovd %eax, %xmm0 1218 ; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0 1219 ; AVX2-FAST-NEXT: vmovd %ecx, %xmm1 1220 ; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1 1221 ; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1222 ; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1223 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1224 ; AVX2-FAST-NEXT: retq 1225 ; 1226 ; AVX512F-LABEL: cvt_2i16_to_2f64: 1227 ; AVX512F: # %bb.0: 1228 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1229 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1230 ; AVX512F-NEXT: vmovd %xmm0, %eax 1231 ; AVX512F-NEXT: movswl %ax, %ecx 1232 ; AVX512F-NEXT: shrl $16, %eax 1233 ; AVX512F-NEXT: cwtl 1234 ; AVX512F-NEXT: vmovd %eax, %xmm0 1235 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1236 ; AVX512F-NEXT: vmovd %ecx, %xmm1 1237 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1238 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1239 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1240 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1241 ; AVX512F-NEXT: retq 1242 ; 1243 ; AVX512VL-LABEL: cvt_2i16_to_2f64: 1244 ; AVX512VL: # %bb.0: 1245 ; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) 1246 ; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax 1247 ; AVX512VL-NEXT: movswl %ax, %ecx 1248 ; AVX512VL-NEXT: shrl $16, %eax 1249 ; AVX512VL-NEXT: cwtl 1250 ; AVX512VL-NEXT: vmovd %eax, %xmm0 1251 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1252 ; AVX512VL-NEXT: vmovd %ecx, %xmm1 1253 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1254 ; AVX512VL-NEXT: vcvtss2sd %xmm1, 
%xmm1, %xmm1 1255 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1256 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1257 ; AVX512VL-NEXT: retq 1258 %1 = bitcast <2 x i16> %a0 to <2 x half> 1259 %2 = fpext <2 x half> %1 to <2 x double> 1260 ret <2 x double> %2 1261 } 1262 1263 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { 1264 ; AVX1-LABEL: cvt_4i16_to_4f64: 1265 ; AVX1: # %bb.0: 1266 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1267 ; AVX1-NEXT: vmovq %xmm0, %rax 1268 ; AVX1-NEXT: movq %rax, %rcx 1269 ; AVX1-NEXT: movl %eax, %edx 1270 ; AVX1-NEXT: movswl %ax, %esi 1271 ; AVX1-NEXT: shrq $48, %rax 1272 ; AVX1-NEXT: shrq $32, %rcx 1273 ; AVX1-NEXT: shrl $16, %edx 1274 ; AVX1-NEXT: movswl %dx, %edx 1275 ; AVX1-NEXT: vmovd %edx, %xmm0 1276 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1277 ; AVX1-NEXT: vmovd %esi, %xmm1 1278 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1279 ; AVX1-NEXT: movswl %cx, %ecx 1280 ; AVX1-NEXT: vmovd %ecx, %xmm2 1281 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 1282 ; AVX1-NEXT: cwtl 1283 ; AVX1-NEXT: vmovd %eax, %xmm3 1284 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 1285 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1286 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1287 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1288 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1289 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1290 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1291 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1292 ; AVX1-NEXT: retq 1293 ; 1294 ; AVX2-LABEL: cvt_4i16_to_4f64: 1295 ; AVX2: # %bb.0: 1296 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1297 ; AVX2-NEXT: vmovq %xmm0, %rax 1298 ; AVX2-NEXT: movq %rax, %rcx 1299 ; AVX2-NEXT: movl %eax, %edx 1300 ; AVX2-NEXT: movswl %ax, %esi 1301 ; AVX2-NEXT: shrq $48, %rax 1302 ; AVX2-NEXT: shrq $32, %rcx 1303 ; AVX2-NEXT: shrl $16, %edx 1304 ; AVX2-NEXT: movswl %dx, %edx 1305 ; AVX2-NEXT: vmovd %edx, %xmm0 1306 ; 
AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1307 ; AVX2-NEXT: vmovd %esi, %xmm1 1308 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1309 ; AVX2-NEXT: movswl %cx, %ecx 1310 ; AVX2-NEXT: vmovd %ecx, %xmm2 1311 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 1312 ; AVX2-NEXT: cwtl 1313 ; AVX2-NEXT: vmovd %eax, %xmm3 1314 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 1315 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1316 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1317 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1318 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1319 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1320 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1321 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1322 ; AVX2-NEXT: retq 1323 ; 1324 ; AVX512F-LABEL: cvt_4i16_to_4f64: 1325 ; AVX512F: # %bb.0: 1326 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1327 ; AVX512F-NEXT: vmovq %xmm0, %rax 1328 ; AVX512F-NEXT: movq %rax, %rcx 1329 ; AVX512F-NEXT: movl %eax, %edx 1330 ; AVX512F-NEXT: movswl %ax, %esi 1331 ; AVX512F-NEXT: shrq $48, %rax 1332 ; AVX512F-NEXT: shrq $32, %rcx 1333 ; AVX512F-NEXT: shrl $16, %edx 1334 ; AVX512F-NEXT: movswl %dx, %edx 1335 ; AVX512F-NEXT: vmovd %edx, %xmm0 1336 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1337 ; AVX512F-NEXT: vmovd %esi, %xmm1 1338 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1339 ; AVX512F-NEXT: movswl %cx, %ecx 1340 ; AVX512F-NEXT: vmovd %ecx, %xmm2 1341 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1342 ; AVX512F-NEXT: cwtl 1343 ; AVX512F-NEXT: vmovd %eax, %xmm3 1344 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1345 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1346 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1347 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1348 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1349 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1350 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1351 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1352 ; AVX512F-NEXT: retq 1353 ; 1354 ; 
AVX512VL-LABEL: cvt_4i16_to_4f64: 1355 ; AVX512VL: # %bb.0: 1356 ; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1357 ; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1358 ; AVX512VL-NEXT: movq %rax, %rcx 1359 ; AVX512VL-NEXT: movl %eax, %edx 1360 ; AVX512VL-NEXT: movswl %ax, %esi 1361 ; AVX512VL-NEXT: shrq $48, %rax 1362 ; AVX512VL-NEXT: shrq $32, %rcx 1363 ; AVX512VL-NEXT: shrl $16, %edx 1364 ; AVX512VL-NEXT: movswl %dx, %edx 1365 ; AVX512VL-NEXT: vmovd %edx, %xmm0 1366 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1367 ; AVX512VL-NEXT: vmovd %esi, %xmm1 1368 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1369 ; AVX512VL-NEXT: movswl %cx, %ecx 1370 ; AVX512VL-NEXT: vmovd %ecx, %xmm2 1371 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1372 ; AVX512VL-NEXT: cwtl 1373 ; AVX512VL-NEXT: vmovd %eax, %xmm3 1374 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1375 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1376 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1377 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1378 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1379 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1380 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1381 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1382 ; AVX512VL-NEXT: retq 1383 %1 = bitcast <4 x i16> %a0 to <4 x half> 1384 %2 = fpext <4 x half> %1 to <4 x double> 1385 ret <4 x double> %2 1386 } 1387 1388 define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { 1389 ; AVX1-LABEL: cvt_8i16_to_2f64: 1390 ; AVX1: # %bb.0: 1391 ; AVX1-NEXT: vmovd %xmm0, %eax 1392 ; AVX1-NEXT: movswl %ax, %ecx 1393 ; AVX1-NEXT: shrl $16, %eax 1394 ; AVX1-NEXT: cwtl 1395 ; AVX1-NEXT: vmovd %eax, %xmm0 1396 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1397 ; AVX1-NEXT: vmovd %ecx, %xmm1 1398 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1399 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1400 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1401 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1402 ; AVX1-NEXT: retq 1403 ; 1404 ; AVX2-LABEL: 
cvt_8i16_to_2f64: 1405 ; AVX2: # %bb.0: 1406 ; AVX2-NEXT: vmovd %xmm0, %eax 1407 ; AVX2-NEXT: movswl %ax, %ecx 1408 ; AVX2-NEXT: shrl $16, %eax 1409 ; AVX2-NEXT: cwtl 1410 ; AVX2-NEXT: vmovd %eax, %xmm0 1411 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1412 ; AVX2-NEXT: vmovd %ecx, %xmm1 1413 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1414 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1415 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1416 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1417 ; AVX2-NEXT: retq 1418 ; 1419 ; AVX512F-LABEL: cvt_8i16_to_2f64: 1420 ; AVX512F: # %bb.0: 1421 ; AVX512F-NEXT: vmovd %xmm0, %eax 1422 ; AVX512F-NEXT: movswl %ax, %ecx 1423 ; AVX512F-NEXT: shrl $16, %eax 1424 ; AVX512F-NEXT: cwtl 1425 ; AVX512F-NEXT: vmovd %eax, %xmm0 1426 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1427 ; AVX512F-NEXT: vmovd %ecx, %xmm1 1428 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1429 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1430 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1431 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1432 ; AVX512F-NEXT: retq 1433 ; 1434 ; AVX512VL-LABEL: cvt_8i16_to_2f64: 1435 ; AVX512VL: # %bb.0: 1436 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1437 ; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp) 1438 ; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax 1439 ; AVX512VL-NEXT: movswl %ax, %ecx 1440 ; AVX512VL-NEXT: shrl $16, %eax 1441 ; AVX512VL-NEXT: cwtl 1442 ; AVX512VL-NEXT: vmovd %eax, %xmm0 1443 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1444 ; AVX512VL-NEXT: vmovd %ecx, %xmm1 1445 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1446 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1447 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1448 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1449 ; AVX512VL-NEXT: retq 1450 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1451 %2 = bitcast <2 x i16> %1 to <2 x half> 1452 %3 = fpext <2 x half> %2 to <2 x double> 1453 ret <2 x 
double> %3 1454 } 1455 1456 define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { 1457 ; AVX1-LABEL: cvt_8i16_to_4f64: 1458 ; AVX1: # %bb.0: 1459 ; AVX1-NEXT: vmovq %xmm0, %rax 1460 ; AVX1-NEXT: movq %rax, %rcx 1461 ; AVX1-NEXT: movl %eax, %edx 1462 ; AVX1-NEXT: movswl %ax, %esi 1463 ; AVX1-NEXT: shrq $48, %rax 1464 ; AVX1-NEXT: shrq $32, %rcx 1465 ; AVX1-NEXT: shrl $16, %edx 1466 ; AVX1-NEXT: movswl %dx, %edx 1467 ; AVX1-NEXT: vmovd %edx, %xmm0 1468 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1469 ; AVX1-NEXT: vmovd %esi, %xmm1 1470 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1471 ; AVX1-NEXT: movswl %cx, %ecx 1472 ; AVX1-NEXT: vmovd %ecx, %xmm2 1473 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 1474 ; AVX1-NEXT: cwtl 1475 ; AVX1-NEXT: vmovd %eax, %xmm3 1476 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 1477 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1478 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1479 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1480 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1481 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1482 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1483 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1484 ; AVX1-NEXT: retq 1485 ; 1486 ; AVX2-LABEL: cvt_8i16_to_4f64: 1487 ; AVX2: # %bb.0: 1488 ; AVX2-NEXT: vmovq %xmm0, %rax 1489 ; AVX2-NEXT: movq %rax, %rcx 1490 ; AVX2-NEXT: movl %eax, %edx 1491 ; AVX2-NEXT: movswl %ax, %esi 1492 ; AVX2-NEXT: shrq $48, %rax 1493 ; AVX2-NEXT: shrq $32, %rcx 1494 ; AVX2-NEXT: shrl $16, %edx 1495 ; AVX2-NEXT: movswl %dx, %edx 1496 ; AVX2-NEXT: vmovd %edx, %xmm0 1497 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1498 ; AVX2-NEXT: vmovd %esi, %xmm1 1499 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1500 ; AVX2-NEXT: movswl %cx, %ecx 1501 ; AVX2-NEXT: vmovd %ecx, %xmm2 1502 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 1503 ; AVX2-NEXT: cwtl 1504 ; AVX2-NEXT: vmovd %eax, %xmm3 1505 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 1506 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1507 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1508 ; AVX2-NEXT: 
vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1509 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1510 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1511 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1512 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1513 ; AVX2-NEXT: retq 1514 ; 1515 ; AVX512F-LABEL: cvt_8i16_to_4f64: 1516 ; AVX512F: # %bb.0: 1517 ; AVX512F-NEXT: vmovq %xmm0, %rax 1518 ; AVX512F-NEXT: movq %rax, %rcx 1519 ; AVX512F-NEXT: movl %eax, %edx 1520 ; AVX512F-NEXT: movswl %ax, %esi 1521 ; AVX512F-NEXT: shrq $48, %rax 1522 ; AVX512F-NEXT: shrq $32, %rcx 1523 ; AVX512F-NEXT: shrl $16, %edx 1524 ; AVX512F-NEXT: movswl %dx, %edx 1525 ; AVX512F-NEXT: vmovd %edx, %xmm0 1526 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1527 ; AVX512F-NEXT: vmovd %esi, %xmm1 1528 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1529 ; AVX512F-NEXT: movswl %cx, %ecx 1530 ; AVX512F-NEXT: vmovd %ecx, %xmm2 1531 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1532 ; AVX512F-NEXT: cwtl 1533 ; AVX512F-NEXT: vmovd %eax, %xmm3 1534 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1535 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1536 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1537 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1538 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1539 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1540 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1541 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1542 ; AVX512F-NEXT: retq 1543 ; 1544 ; AVX512VL-LABEL: cvt_8i16_to_4f64: 1545 ; AVX512VL: # %bb.0: 1546 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1547 ; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1548 ; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1549 ; AVX512VL-NEXT: movq %rax, %rcx 1550 ; AVX512VL-NEXT: movl %eax, %edx 1551 ; AVX512VL-NEXT: movswl %ax, %esi 1552 ; AVX512VL-NEXT: shrq $48, %rax 1553 ; AVX512VL-NEXT: shrq $32, %rcx 1554 ; AVX512VL-NEXT: shrl $16, %edx 1555 ; AVX512VL-NEXT: movswl %dx, %edx 1556 
; AVX512VL-NEXT: vmovd %edx, %xmm0 1557 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1558 ; AVX512VL-NEXT: vmovd %esi, %xmm1 1559 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1560 ; AVX512VL-NEXT: movswl %cx, %ecx 1561 ; AVX512VL-NEXT: vmovd %ecx, %xmm2 1562 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1563 ; AVX512VL-NEXT: cwtl 1564 ; AVX512VL-NEXT: vmovd %eax, %xmm3 1565 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1566 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1567 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1568 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1569 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1570 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1571 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1572 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1573 ; AVX512VL-NEXT: retq 1574 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1575 %2 = bitcast <4 x i16> %1 to <4 x half> 1576 %3 = fpext <4 x half> %2 to <4 x double> 1577 ret <4 x double> %3 1578 } 1579 1580 define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { 1581 ; AVX1-LABEL: cvt_8i16_to_8f64: 1582 ; AVX1: # %bb.0: 1583 ; AVX1-NEXT: vmovq %xmm0, %rdx 1584 ; AVX1-NEXT: movq %rdx, %r9 1585 ; AVX1-NEXT: movl %edx, %r10d 1586 ; AVX1-NEXT: movswl %dx, %r8d 1587 ; AVX1-NEXT: shrq $48, %rdx 1588 ; AVX1-NEXT: shrq $32, %r9 1589 ; AVX1-NEXT: shrl $16, %r10d 1590 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdi 1591 ; AVX1-NEXT: movq %rdi, %rsi 1592 ; AVX1-NEXT: movl %edi, %eax 1593 ; AVX1-NEXT: movswl %di, %ecx 1594 ; AVX1-NEXT: shrq $48, %rdi 1595 ; AVX1-NEXT: shrq $32, %rsi 1596 ; AVX1-NEXT: shrl $16, %eax 1597 ; AVX1-NEXT: cwtl 1598 ; AVX1-NEXT: vmovd %eax, %xmm0 1599 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1600 ; AVX1-NEXT: vmovd %ecx, %xmm0 1601 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1602 ; AVX1-NEXT: movswl %si, %eax 1603 ; AVX1-NEXT: vmovd %eax, %xmm0 1604 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1605 ; AVX1-NEXT: movswl %di, %eax 1606 ; AVX1-NEXT: 
vmovd %eax, %xmm0 1607 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1608 ; AVX1-NEXT: movswl %r10w, %eax 1609 ; AVX1-NEXT: vmovd %eax, %xmm0 1610 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1611 ; AVX1-NEXT: vmovd %r8d, %xmm5 1612 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1613 ; AVX1-NEXT: movswl %r9w, %eax 1614 ; AVX1-NEXT: vmovd %eax, %xmm6 1615 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1616 ; AVX1-NEXT: movswl %dx, %eax 1617 ; AVX1-NEXT: vmovd %eax, %xmm7 1618 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1619 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1620 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1621 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1622 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1623 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1624 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1625 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1626 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1627 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1628 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1629 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1630 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1631 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1632 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1633 ; AVX1-NEXT: retq 1634 ; 1635 ; AVX2-LABEL: cvt_8i16_to_8f64: 1636 ; AVX2: # %bb.0: 1637 ; AVX2-NEXT: vmovq %xmm0, %rdx 1638 ; AVX2-NEXT: movq %rdx, %r9 1639 ; AVX2-NEXT: movl %edx, %r10d 1640 ; AVX2-NEXT: movswl %dx, %r8d 1641 ; AVX2-NEXT: shrq $48, %rdx 1642 ; AVX2-NEXT: shrq $32, %r9 1643 ; AVX2-NEXT: shrl $16, %r10d 1644 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdi 1645 ; AVX2-NEXT: movq %rdi, %rsi 1646 ; AVX2-NEXT: movl %edi, %eax 1647 ; AVX2-NEXT: movswl %di, %ecx 1648 ; AVX2-NEXT: shrq $48, %rdi 1649 ; AVX2-NEXT: shrq $32, %rsi 1650 ; AVX2-NEXT: shrl $16, %eax 1651 ; AVX2-NEXT: cwtl 1652 ; AVX2-NEXT: vmovd %eax, %xmm0 1653 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1654 ; AVX2-NEXT: vmovd %ecx, %xmm0 1655 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1656 ; AVX2-NEXT: movswl %si, %eax 1657 ; 
AVX2-NEXT: vmovd %eax, %xmm0 1658 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1659 ; AVX2-NEXT: movswl %di, %eax 1660 ; AVX2-NEXT: vmovd %eax, %xmm0 1661 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1662 ; AVX2-NEXT: movswl %r10w, %eax 1663 ; AVX2-NEXT: vmovd %eax, %xmm0 1664 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1665 ; AVX2-NEXT: vmovd %r8d, %xmm5 1666 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1667 ; AVX2-NEXT: movswl %r9w, %eax 1668 ; AVX2-NEXT: vmovd %eax, %xmm6 1669 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1670 ; AVX2-NEXT: movswl %dx, %eax 1671 ; AVX2-NEXT: vmovd %eax, %xmm7 1672 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1673 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1674 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1675 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1676 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1677 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1678 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1679 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1680 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1681 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1682 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1683 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1684 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1685 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1686 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1687 ; AVX2-NEXT: retq 1688 ; 1689 ; AVX512-LABEL: cvt_8i16_to_8f64: 1690 ; AVX512: # %bb.0: 1691 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx 1692 ; AVX512-NEXT: movq %rdx, %r9 1693 ; AVX512-NEXT: movl %edx, %r10d 1694 ; AVX512-NEXT: movswl %dx, %r8d 1695 ; AVX512-NEXT: shrq $48, %rdx 1696 ; AVX512-NEXT: shrq $32, %r9 1697 ; AVX512-NEXT: shrl $16, %r10d 1698 ; AVX512-NEXT: vmovq %xmm0, %rdi 1699 ; AVX512-NEXT: movq %rdi, %rsi 1700 ; AVX512-NEXT: movl %edi, %eax 1701 ; AVX512-NEXT: movswl %di, %ecx 1702 ; AVX512-NEXT: shrq $48, %rdi 1703 ; AVX512-NEXT: shrq $32, %rsi 1704 ; AVX512-NEXT: shrl $16, %eax 1705 ; AVX512-NEXT: cwtl 1706 ; AVX512-NEXT: vmovd %eax, %xmm0 1707 ; 
AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1708 ; AVX512-NEXT: vmovd %ecx, %xmm1 1709 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1710 ; AVX512-NEXT: movswl %si, %eax 1711 ; AVX512-NEXT: vmovd %eax, %xmm2 1712 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1713 ; AVX512-NEXT: movswl %di, %eax 1714 ; AVX512-NEXT: vmovd %eax, %xmm3 1715 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1716 ; AVX512-NEXT: movswl %r10w, %eax 1717 ; AVX512-NEXT: vmovd %eax, %xmm4 1718 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1719 ; AVX512-NEXT: vmovd %r8d, %xmm5 1720 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1721 ; AVX512-NEXT: movswl %r9w, %eax 1722 ; AVX512-NEXT: vmovd %eax, %xmm6 1723 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1724 ; AVX512-NEXT: movswl %dx, %eax 1725 ; AVX512-NEXT: vmovd %eax, %xmm7 1726 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1727 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1728 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1729 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1730 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1731 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1732 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] 1733 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1734 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1735 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1736 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1737 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1738 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1739 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1740 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1741 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1742 ; AVX512-NEXT: retq 1743 %1 = bitcast <8 x i16> %a0 to <8 x half> 1744 %2 = fpext <8 x half> %1 to <8 x double> 1745 ret <8 x double> %2 1746 } 1747 1748 ; 1749 ; Half to Double (Load) 1750 ; 1751 1752 define double @load_cvt_i16_to_f64(i16* %a0) nounwind { 1753 ; ALL-LABEL: load_cvt_i16_to_f64: 1754 ; ALL: # %bb.0: 1755 ; ALL-NEXT: movswl (%rdi), %eax 1756 ; ALL-NEXT: vmovd %eax, %xmm0 
1757 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1758 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1759 ; ALL-NEXT: retq 1760 %1 = load i16, i16* %a0 1761 %2 = bitcast i16 %1 to half 1762 %3 = fpext half %2 to double 1763 ret double %3 1764 } 1765 1766 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind { 1767 ; ALL-LABEL: load_cvt_2i16_to_2f64: 1768 ; ALL: # %bb.0: 1769 ; ALL-NEXT: movswl (%rdi), %eax 1770 ; ALL-NEXT: vmovd %eax, %xmm0 1771 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1772 ; ALL-NEXT: movswl 2(%rdi), %eax 1773 ; ALL-NEXT: vmovd %eax, %xmm1 1774 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1775 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1776 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1777 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1778 ; ALL-NEXT: retq 1779 %1 = load <2 x i16>, <2 x i16>* %a0 1780 %2 = bitcast <2 x i16> %1 to <2 x half> 1781 %3 = fpext <2 x half> %2 to <2 x double> 1782 ret <2 x double> %3 1783 } 1784 1785 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { 1786 ; ALL-LABEL: load_cvt_4i16_to_4f64: 1787 ; ALL: # %bb.0: 1788 ; ALL-NEXT: movswl (%rdi), %eax 1789 ; ALL-NEXT: vmovd %eax, %xmm0 1790 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1791 ; ALL-NEXT: movswl 2(%rdi), %eax 1792 ; ALL-NEXT: vmovd %eax, %xmm1 1793 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1794 ; ALL-NEXT: movswl 4(%rdi), %eax 1795 ; ALL-NEXT: vmovd %eax, %xmm2 1796 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1797 ; ALL-NEXT: movswl 6(%rdi), %eax 1798 ; ALL-NEXT: vmovd %eax, %xmm3 1799 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1800 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1801 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1802 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1803 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1804 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1805 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1806 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1807 ; ALL-NEXT: retq 1808 %1 = load <4 x i16>, <4 x i16>* %a0 1809 %2 = bitcast <4 x i16> %1 to <4 x half> 1810 
%3 = fpext <4 x half> %2 to <4 x double> 1811 ret <4 x double> %3 1812 } 1813 1814 define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { 1815 ; AVX1-LABEL: load_cvt_8i16_to_4f64: 1816 ; AVX1: # %bb.0: 1817 ; AVX1-NEXT: movq (%rdi), %rax 1818 ; AVX1-NEXT: movq %rax, %rcx 1819 ; AVX1-NEXT: movl %eax, %edx 1820 ; AVX1-NEXT: movswl %ax, %esi 1821 ; AVX1-NEXT: shrq $48, %rax 1822 ; AVX1-NEXT: shrq $32, %rcx 1823 ; AVX1-NEXT: shrl $16, %edx 1824 ; AVX1-NEXT: movswl %dx, %edx 1825 ; AVX1-NEXT: vmovd %edx, %xmm0 1826 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1827 ; AVX1-NEXT: vmovd %esi, %xmm1 1828 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1829 ; AVX1-NEXT: movswl %cx, %ecx 1830 ; AVX1-NEXT: vmovd %ecx, %xmm2 1831 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 1832 ; AVX1-NEXT: cwtl 1833 ; AVX1-NEXT: vmovd %eax, %xmm3 1834 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 1835 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1836 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1837 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1838 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1839 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1840 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1841 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1842 ; AVX1-NEXT: retq 1843 ; 1844 ; AVX2-LABEL: load_cvt_8i16_to_4f64: 1845 ; AVX2: # %bb.0: 1846 ; AVX2-NEXT: movq (%rdi), %rax 1847 ; AVX2-NEXT: movq %rax, %rcx 1848 ; AVX2-NEXT: movl %eax, %edx 1849 ; AVX2-NEXT: movswl %ax, %esi 1850 ; AVX2-NEXT: shrq $48, %rax 1851 ; AVX2-NEXT: shrq $32, %rcx 1852 ; AVX2-NEXT: shrl $16, %edx 1853 ; AVX2-NEXT: movswl %dx, %edx 1854 ; AVX2-NEXT: vmovd %edx, %xmm0 1855 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1856 ; AVX2-NEXT: vmovd %esi, %xmm1 1857 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1858 ; AVX2-NEXT: movswl %cx, %ecx 1859 ; AVX2-NEXT: vmovd %ecx, %xmm2 1860 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 1861 ; AVX2-NEXT: cwtl 1862 ; AVX2-NEXT: vmovd %eax, %xmm3 1863 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 1864 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, 
%xmm3 1865 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1866 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1867 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1868 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1869 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1870 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1871 ; AVX2-NEXT: retq 1872 ; 1873 ; AVX512F-LABEL: load_cvt_8i16_to_4f64: 1874 ; AVX512F: # %bb.0: 1875 ; AVX512F-NEXT: movq (%rdi), %rax 1876 ; AVX512F-NEXT: movq %rax, %rcx 1877 ; AVX512F-NEXT: movl %eax, %edx 1878 ; AVX512F-NEXT: movswl %ax, %esi 1879 ; AVX512F-NEXT: shrq $48, %rax 1880 ; AVX512F-NEXT: shrq $32, %rcx 1881 ; AVX512F-NEXT: shrl $16, %edx 1882 ; AVX512F-NEXT: movswl %dx, %edx 1883 ; AVX512F-NEXT: vmovd %edx, %xmm0 1884 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1885 ; AVX512F-NEXT: vmovd %esi, %xmm1 1886 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1887 ; AVX512F-NEXT: movswl %cx, %ecx 1888 ; AVX512F-NEXT: vmovd %ecx, %xmm2 1889 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1890 ; AVX512F-NEXT: cwtl 1891 ; AVX512F-NEXT: vmovd %eax, %xmm3 1892 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1893 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1894 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1895 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1896 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1897 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1898 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1899 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1900 ; AVX512F-NEXT: retq 1901 ; 1902 ; AVX512VL-LABEL: load_cvt_8i16_to_4f64: 1903 ; AVX512VL: # %bb.0: 1904 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1905 ; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1906 ; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1907 ; AVX512VL-NEXT: movq %rax, %rcx 1908 ; AVX512VL-NEXT: movl %eax, %edx 1909 ; AVX512VL-NEXT: movswl %ax, %esi 1910 ; AVX512VL-NEXT: shrq $48, %rax 1911 ; AVX512VL-NEXT: shrq $32, %rcx 
1912 ; AVX512VL-NEXT: shrl $16, %edx 1913 ; AVX512VL-NEXT: movswl %dx, %edx 1914 ; AVX512VL-NEXT: vmovd %edx, %xmm0 1915 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1916 ; AVX512VL-NEXT: vmovd %esi, %xmm1 1917 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1918 ; AVX512VL-NEXT: movswl %cx, %ecx 1919 ; AVX512VL-NEXT: vmovd %ecx, %xmm2 1920 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1921 ; AVX512VL-NEXT: cwtl 1922 ; AVX512VL-NEXT: vmovd %eax, %xmm3 1923 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1924 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1925 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1926 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1927 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1928 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1929 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1930 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1931 ; AVX512VL-NEXT: retq 1932 %1 = load <8 x i16>, <8 x i16>* %a0 1933 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1934 %3 = bitcast <4 x i16> %2 to <4 x half> 1935 %4 = fpext <4 x half> %3 to <4 x double> 1936 ret <4 x double> %4 1937 } 1938 1939 define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { 1940 ; AVX1-LABEL: load_cvt_8i16_to_8f64: 1941 ; AVX1: # %bb.0: 1942 ; AVX1-NEXT: movswl 8(%rdi), %eax 1943 ; AVX1-NEXT: vmovd %eax, %xmm0 1944 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1945 ; AVX1-NEXT: movswl 10(%rdi), %eax 1946 ; AVX1-NEXT: vmovd %eax, %xmm0 1947 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1948 ; AVX1-NEXT: movswl 12(%rdi), %eax 1949 ; AVX1-NEXT: vmovd %eax, %xmm0 1950 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1951 ; AVX1-NEXT: movswl 14(%rdi), %eax 1952 ; AVX1-NEXT: vmovd %eax, %xmm0 1953 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1954 ; AVX1-NEXT: movswl (%rdi), %eax 1955 ; AVX1-NEXT: vmovd %eax, %xmm0 1956 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1957 ; AVX1-NEXT: movswl 2(%rdi), %eax 1958 ; AVX1-NEXT: vmovd %eax, %xmm5 1959 ; AVX1-NEXT: vcvtph2ps %xmm5, 
%xmm5 1960 ; AVX1-NEXT: movswl 4(%rdi), %eax 1961 ; AVX1-NEXT: vmovd %eax, %xmm6 1962 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1963 ; AVX1-NEXT: movswl 6(%rdi), %eax 1964 ; AVX1-NEXT: vmovd %eax, %xmm7 1965 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1966 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1967 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1968 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1969 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1970 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1971 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1972 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1973 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1974 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1975 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1976 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1977 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1978 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1979 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1980 ; AVX1-NEXT: retq 1981 ; 1982 ; AVX2-LABEL: load_cvt_8i16_to_8f64: 1983 ; AVX2: # %bb.0: 1984 ; AVX2-NEXT: movswl 8(%rdi), %eax 1985 ; AVX2-NEXT: vmovd %eax, %xmm0 1986 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1987 ; AVX2-NEXT: movswl 10(%rdi), %eax 1988 ; AVX2-NEXT: vmovd %eax, %xmm0 1989 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1990 ; AVX2-NEXT: movswl 12(%rdi), %eax 1991 ; AVX2-NEXT: vmovd %eax, %xmm0 1992 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1993 ; AVX2-NEXT: movswl 14(%rdi), %eax 1994 ; AVX2-NEXT: vmovd %eax, %xmm0 1995 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1996 ; AVX2-NEXT: movswl (%rdi), %eax 1997 ; AVX2-NEXT: vmovd %eax, %xmm0 1998 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1999 ; AVX2-NEXT: movswl 2(%rdi), %eax 2000 ; AVX2-NEXT: vmovd %eax, %xmm5 2001 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 2002 ; AVX2-NEXT: movswl 4(%rdi), %eax 2003 ; AVX2-NEXT: vmovd %eax, %xmm6 2004 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 2005 ; AVX2-NEXT: movswl 6(%rdi), %eax 2006 ; AVX2-NEXT: vmovd %eax, %xmm7 2007 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 2008 ; 
AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 2009 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 2010 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2011 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 2012 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2013 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 2014 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 2015 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 2016 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 2017 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 2018 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 2019 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 2020 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2021 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 2022 ; AVX2-NEXT: retq 2023 ; 2024 ; AVX512-LABEL: load_cvt_8i16_to_8f64: 2025 ; AVX512: # %bb.0: 2026 ; AVX512-NEXT: movswl (%rdi), %eax 2027 ; AVX512-NEXT: vmovd %eax, %xmm0 2028 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 2029 ; AVX512-NEXT: movswl 2(%rdi), %eax 2030 ; AVX512-NEXT: vmovd %eax, %xmm1 2031 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 2032 ; AVX512-NEXT: movswl 4(%rdi), %eax 2033 ; AVX512-NEXT: vmovd %eax, %xmm2 2034 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 2035 ; AVX512-NEXT: movswl 6(%rdi), %eax 2036 ; AVX512-NEXT: vmovd %eax, %xmm3 2037 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 2038 ; AVX512-NEXT: movswl 8(%rdi), %eax 2039 ; AVX512-NEXT: vmovd %eax, %xmm4 2040 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 2041 ; AVX512-NEXT: movswl 10(%rdi), %eax 2042 ; AVX512-NEXT: vmovd %eax, %xmm5 2043 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 2044 ; AVX512-NEXT: movswl 12(%rdi), %eax 2045 ; AVX512-NEXT: vmovd %eax, %xmm6 2046 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 2047 ; AVX512-NEXT: movswl 14(%rdi), %eax 2048 ; AVX512-NEXT: vmovd %eax, %xmm7 2049 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 2050 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 2051 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 2052 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2053 ; AVX512-NEXT: vcvtss2sd %xmm5, 
%xmm5, %xmm5 2054 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 2055 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 2056 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 2057 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 2058 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 2059 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2060 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 2061 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2062 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2063 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2064 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 2065 ; AVX512-NEXT: retq 2066 %1 = load <8 x i16>, <8 x i16>* %a0 2067 %2 = bitcast <8 x i16> %1 to <8 x half> 2068 %3 = fpext <8 x half> %2 to <8 x double> 2069 ret <8 x double> %3 2070 } 2071 2072 ; 2073 ; Float to Half 2074 ; 2075 2076 define i16 @cvt_f32_to_i16(float %a0) nounwind { 2077 ; ALL-LABEL: cvt_f32_to_i16: 2078 ; ALL: # %bb.0: 2079 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2080 ; ALL-NEXT: vmovd %xmm0, %eax 2081 ; ALL-NEXT: # kill: def $ax killed $ax killed $eax 2082 ; ALL-NEXT: retq 2083 %1 = fptrunc float %a0 to half 2084 %2 = bitcast half %1 to i16 2085 ret i16 %2 2086 } 2087 2088 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind { 2089 ; ALL-LABEL: cvt_4f32_to_4i16: 2090 ; ALL: # %bb.0: 2091 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2092 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2093 ; ALL-NEXT: vmovd %xmm1, %eax 2094 ; ALL-NEXT: shll $16, %eax 2095 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2096 ; ALL-NEXT: vmovd %xmm1, %ecx 2097 ; ALL-NEXT: movzwl %cx, %ecx 2098 ; ALL-NEXT: orl %eax, %ecx 2099 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2100 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2101 ; ALL-NEXT: vmovd %xmm1, %eax 2102 ; ALL-NEXT: shll $16, %eax 2103 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2104 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2105 ; ALL-NEXT: vmovd %xmm0, %edx 2106 ; ALL-NEXT: movzwl %dx, %edx 2107 ; 
ALL-NEXT: orl %eax, %edx 2108 ; ALL-NEXT: shlq $32, %rdx 2109 ; ALL-NEXT: orq %rcx, %rdx 2110 ; ALL-NEXT: vmovq %rdx, %xmm0 2111 ; ALL-NEXT: retq 2112 %1 = fptrunc <4 x float> %a0 to <4 x half> 2113 %2 = bitcast <4 x half> %1 to <4 x i16> 2114 ret <4 x i16> %2 2115 } 2116 2117 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { 2118 ; ALL-LABEL: cvt_4f32_to_8i16_undef: 2119 ; ALL: # %bb.0: 2120 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2121 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2122 ; ALL-NEXT: vmovd %xmm1, %eax 2123 ; ALL-NEXT: shll $16, %eax 2124 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2125 ; ALL-NEXT: vmovd %xmm1, %ecx 2126 ; ALL-NEXT: movzwl %cx, %ecx 2127 ; ALL-NEXT: orl %eax, %ecx 2128 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2129 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2130 ; ALL-NEXT: vmovd %xmm1, %eax 2131 ; ALL-NEXT: shll $16, %eax 2132 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2133 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2134 ; ALL-NEXT: vmovd %xmm0, %edx 2135 ; ALL-NEXT: movzwl %dx, %edx 2136 ; ALL-NEXT: orl %eax, %edx 2137 ; ALL-NEXT: shlq $32, %rdx 2138 ; ALL-NEXT: orq %rcx, %rdx 2139 ; ALL-NEXT: vmovq %rdx, %xmm0 2140 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2141 ; ALL-NEXT: retq 2142 %1 = fptrunc <4 x float> %a0 to <4 x half> 2143 %2 = bitcast <4 x half> %1 to <4 x i16> 2144 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2145 ret <8 x i16> %3 2146 } 2147 2148 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { 2149 ; AVX1-LABEL: cvt_4f32_to_8i16_zero: 2150 ; AVX1: # %bb.0: 2151 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2152 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2153 ; AVX1-NEXT: vmovd %xmm1, %eax 2154 ; AVX1-NEXT: shll $16, %eax 2155 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2156 ; AVX1-NEXT: vmovd %xmm1, %ecx 2157 ; AVX1-NEXT: movzwl %cx, %ecx 2158 ; AVX1-NEXT: orl %eax, %ecx 2159 ; AVX1-NEXT: 
vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2160 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2161 ; AVX1-NEXT: vmovd %xmm1, %eax 2162 ; AVX1-NEXT: shll $16, %eax 2163 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2164 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2165 ; AVX1-NEXT: vmovd %xmm0, %edx 2166 ; AVX1-NEXT: movzwl %dx, %edx 2167 ; AVX1-NEXT: orl %eax, %edx 2168 ; AVX1-NEXT: shlq $32, %rdx 2169 ; AVX1-NEXT: orq %rcx, %rdx 2170 ; AVX1-NEXT: vmovq %rdx, %xmm0 2171 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2172 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2173 ; AVX1-NEXT: retq 2174 ; 2175 ; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero: 2176 ; AVX2-SLOW: # %bb.0: 2177 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2178 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2179 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2180 ; AVX2-SLOW-NEXT: shll $16, %eax 2181 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2182 ; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx 2183 ; AVX2-SLOW-NEXT: movzwl %cx, %ecx 2184 ; AVX2-SLOW-NEXT: orl %eax, %ecx 2185 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2186 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2187 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2188 ; AVX2-SLOW-NEXT: shll $16, %eax 2189 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2190 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2191 ; AVX2-SLOW-NEXT: vmovd %xmm0, %edx 2192 ; AVX2-SLOW-NEXT: movzwl %dx, %edx 2193 ; AVX2-SLOW-NEXT: orl %eax, %edx 2194 ; AVX2-SLOW-NEXT: shlq $32, %rdx 2195 ; AVX2-SLOW-NEXT: orq %rcx, %rdx 2196 ; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 2197 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2198 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2199 ; AVX2-SLOW-NEXT: retq 2200 ; 2201 ; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero: 2202 ; AVX2-FAST: # %bb.0: 2203 ; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2204 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2205 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2206 ; AVX2-FAST-NEXT: shll 
$16, %eax 2207 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2208 ; AVX2-FAST-NEXT: vmovd %xmm1, %ecx 2209 ; AVX2-FAST-NEXT: movzwl %cx, %ecx 2210 ; AVX2-FAST-NEXT: orl %eax, %ecx 2211 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2212 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2213 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2214 ; AVX2-FAST-NEXT: shll $16, %eax 2215 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2216 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2217 ; AVX2-FAST-NEXT: vmovd %xmm0, %edx 2218 ; AVX2-FAST-NEXT: movzwl %dx, %edx 2219 ; AVX2-FAST-NEXT: orl %eax, %edx 2220 ; AVX2-FAST-NEXT: shlq $32, %rdx 2221 ; AVX2-FAST-NEXT: orq %rcx, %rdx 2222 ; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 2223 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2224 ; AVX2-FAST-NEXT: retq 2225 ; 2226 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero: 2227 ; AVX512F: # %bb.0: 2228 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2229 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2230 ; AVX512F-NEXT: vmovd %xmm1, %eax 2231 ; AVX512F-NEXT: shll $16, %eax 2232 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2233 ; AVX512F-NEXT: vmovd %xmm1, %ecx 2234 ; AVX512F-NEXT: movzwl %cx, %ecx 2235 ; AVX512F-NEXT: orl %eax, %ecx 2236 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2237 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2238 ; AVX512F-NEXT: vmovd %xmm1, %eax 2239 ; AVX512F-NEXT: shll $16, %eax 2240 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2241 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2242 ; AVX512F-NEXT: vmovd %xmm0, %edx 2243 ; AVX512F-NEXT: movzwl %dx, %edx 2244 ; AVX512F-NEXT: orl %eax, %edx 2245 ; AVX512F-NEXT: shlq $32, %rdx 2246 ; AVX512F-NEXT: orq %rcx, %rdx 2247 ; AVX512F-NEXT: vmovq %rdx, %xmm0 2248 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2249 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2250 ; AVX512F-NEXT: retq 2251 ; 2252 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: 2253 ; 
AVX512VL: # %bb.0: 2254 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2255 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2256 ; AVX512VL-NEXT: vmovd %xmm1, %eax 2257 ; AVX512VL-NEXT: shll $16, %eax 2258 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2259 ; AVX512VL-NEXT: vmovd %xmm1, %ecx 2260 ; AVX512VL-NEXT: movzwl %cx, %ecx 2261 ; AVX512VL-NEXT: orl %eax, %ecx 2262 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2263 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2264 ; AVX512VL-NEXT: vmovd %xmm1, %eax 2265 ; AVX512VL-NEXT: shll $16, %eax 2266 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2267 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2268 ; AVX512VL-NEXT: vmovd %xmm0, %edx 2269 ; AVX512VL-NEXT: movzwl %dx, %edx 2270 ; AVX512VL-NEXT: orl %eax, %edx 2271 ; AVX512VL-NEXT: shlq $32, %rdx 2272 ; AVX512VL-NEXT: orq %rcx, %rdx 2273 ; AVX512VL-NEXT: vmovq %rdx, %xmm0 2274 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2275 ; AVX512VL-NEXT: retq 2276 %1 = fptrunc <4 x float> %a0 to <4 x half> 2277 %2 = bitcast <4 x half> %1 to <4 x i16> 2278 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2279 ret <8 x i16> %3 2280 } 2281 2282 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { 2283 ; ALL-LABEL: cvt_8f32_to_8i16: 2284 ; ALL: # %bb.0: 2285 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2286 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2287 ; ALL-NEXT: vmovd %xmm1, %eax 2288 ; ALL-NEXT: shll $16, %eax 2289 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2290 ; ALL-NEXT: vmovd %xmm1, %ecx 2291 ; ALL-NEXT: movzwl %cx, %ecx 2292 ; ALL-NEXT: orl %eax, %ecx 2293 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2294 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2295 ; ALL-NEXT: vmovd %xmm1, %edx 2296 ; ALL-NEXT: shll $16, %edx 2297 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2298 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2299 
; ALL-NEXT: vmovd %xmm1, %eax 2300 ; ALL-NEXT: movzwl %ax, %eax 2301 ; ALL-NEXT: orl %edx, %eax 2302 ; ALL-NEXT: shlq $32, %rax 2303 ; ALL-NEXT: orq %rcx, %rax 2304 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 2305 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2306 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2307 ; ALL-NEXT: vmovd %xmm1, %ecx 2308 ; ALL-NEXT: shll $16, %ecx 2309 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2310 ; ALL-NEXT: vmovd %xmm1, %edx 2311 ; ALL-NEXT: movzwl %dx, %edx 2312 ; ALL-NEXT: orl %ecx, %edx 2313 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2314 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2315 ; ALL-NEXT: vmovd %xmm1, %ecx 2316 ; ALL-NEXT: shll $16, %ecx 2317 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2318 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2319 ; ALL-NEXT: vmovd %xmm0, %esi 2320 ; ALL-NEXT: movzwl %si, %esi 2321 ; ALL-NEXT: orl %ecx, %esi 2322 ; ALL-NEXT: shlq $32, %rsi 2323 ; ALL-NEXT: orq %rdx, %rsi 2324 ; ALL-NEXT: vmovq %rsi, %xmm0 2325 ; ALL-NEXT: vmovq %rax, %xmm1 2326 ; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2327 ; ALL-NEXT: vzeroupper 2328 ; ALL-NEXT: retq 2329 %1 = fptrunc <8 x float> %a0 to <8 x half> 2330 %2 = bitcast <8 x half> %1 to <8 x i16> 2331 ret <8 x i16> %2 2332 } 2333 2334 define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { 2335 ; AVX1-LABEL: cvt_16f32_to_16i16: 2336 ; AVX1: # %bb.0: 2337 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2338 ; AVX1-NEXT: vmovd %xmm2, %eax 2339 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2340 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2341 ; AVX1-NEXT: vmovd %eax, %xmm3 2342 ; AVX1-NEXT: vmovd %xmm2, %eax 2343 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2344 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2345 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2346 ; AVX1-NEXT: vmovd %xmm2, %eax 2347 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2348 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 2349 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2350 ; AVX1-NEXT: 
vpinsrw $2, %eax, %xmm3, %xmm3 2351 ; AVX1-NEXT: vmovd %xmm1, %eax 2352 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2353 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2354 ; AVX1-NEXT: vmovd %xmm1, %eax 2355 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2356 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2357 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2358 ; AVX1-NEXT: vmovd %xmm1, %eax 2359 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2360 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2361 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2362 ; AVX1-NEXT: vmovd %xmm1, %eax 2363 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2364 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2365 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2366 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2367 ; AVX1-NEXT: vmovd %xmm2, %eax 2368 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2369 ; AVX1-NEXT: vmovd %xmm1, %eax 2370 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2371 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2372 ; AVX1-NEXT: vmovd %eax, %xmm3 2373 ; AVX1-NEXT: vmovd %xmm1, %eax 2374 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2375 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2376 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2377 ; AVX1-NEXT: vmovd %xmm1, %eax 2378 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2379 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2380 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2381 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2382 ; AVX1-NEXT: vmovd %xmm0, %eax 2383 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2384 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2385 ; AVX1-NEXT: vmovd %xmm0, %eax 2386 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2387 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2388 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2389 ; AVX1-NEXT: vmovd %xmm0, %eax 2390 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2391 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2392 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2393 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, 
%xmm1 2394 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2395 ; AVX1-NEXT: vmovd %xmm1, %eax 2396 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2397 ; AVX1-NEXT: vmovd %xmm0, %eax 2398 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2399 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2400 ; AVX1-NEXT: retq 2401 ; 2402 ; AVX2-LABEL: cvt_16f32_to_16i16: 2403 ; AVX2: # %bb.0: 2404 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2405 ; AVX2-NEXT: vmovd %xmm2, %eax 2406 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2407 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2408 ; AVX2-NEXT: vmovd %eax, %xmm3 2409 ; AVX2-NEXT: vmovd %xmm2, %eax 2410 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2411 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2412 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2413 ; AVX2-NEXT: vmovd %xmm2, %eax 2414 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 2415 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 2416 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2417 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2418 ; AVX2-NEXT: vmovd %xmm1, %eax 2419 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2420 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2421 ; AVX2-NEXT: vmovd %xmm1, %eax 2422 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2423 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2424 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2425 ; AVX2-NEXT: vmovd %xmm1, %eax 2426 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2427 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2428 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2429 ; AVX2-NEXT: vmovd %xmm1, %eax 2430 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2431 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2432 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2433 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2434 ; AVX2-NEXT: vmovd %xmm2, %eax 2435 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2436 ; AVX2-NEXT: vmovd %xmm1, %eax 2437 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2438 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2439 ; AVX2-NEXT: vmovd %eax, 
%xmm3 2440 ; AVX2-NEXT: vmovd %xmm1, %eax 2441 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2442 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2443 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2444 ; AVX2-NEXT: vmovd %xmm1, %eax 2445 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 2446 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2447 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2448 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2449 ; AVX2-NEXT: vmovd %xmm0, %eax 2450 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2451 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2452 ; AVX2-NEXT: vmovd %xmm0, %eax 2453 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2454 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2455 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2456 ; AVX2-NEXT: vmovd %xmm0, %eax 2457 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2458 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2459 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2460 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2461 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2462 ; AVX2-NEXT: vmovd %xmm1, %eax 2463 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2464 ; AVX2-NEXT: vmovd %xmm0, %eax 2465 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2466 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2467 ; AVX2-NEXT: retq 2468 ; 2469 ; AVX512-LABEL: cvt_16f32_to_16i16: 2470 ; AVX512: # %bb.0: 2471 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 2472 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2473 ; AVX512-NEXT: vmovd %xmm2, %eax 2474 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2475 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2476 ; AVX512-NEXT: vmovd %eax, %xmm3 2477 ; AVX512-NEXT: vmovd %xmm2, %eax 2478 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2479 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2480 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2481 ; AVX512-NEXT: vmovd %xmm2, %eax 2482 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 2483 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 2484 ; AVX512-NEXT: 
vcvtps2ph $4, %xmm1, %xmm1 2485 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2486 ; AVX512-NEXT: vmovd %xmm1, %eax 2487 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2488 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2489 ; AVX512-NEXT: vmovd %xmm1, %eax 2490 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2491 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2492 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2493 ; AVX512-NEXT: vmovd %xmm1, %eax 2494 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2495 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2496 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2497 ; AVX512-NEXT: vmovd %xmm1, %eax 2498 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2499 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2500 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2501 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2502 ; AVX512-NEXT: vmovd %xmm2, %eax 2503 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2504 ; AVX512-NEXT: vmovd %xmm1, %eax 2505 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2506 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2507 ; AVX512-NEXT: vmovd %eax, %xmm3 2508 ; AVX512-NEXT: vmovd %xmm1, %eax 2509 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2510 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2511 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2512 ; AVX512-NEXT: vmovd %xmm1, %eax 2513 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2514 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2515 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2516 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2517 ; AVX512-NEXT: vmovd %xmm0, %eax 2518 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2519 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2520 ; AVX512-NEXT: vmovd %xmm0, %eax 2521 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2522 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2523 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2524 ; AVX512-NEXT: vmovd %xmm0, %eax 2525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] 2526 ; 
AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2527 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2528 ; AVX512-NEXT: vmovd %xmm0, %eax 2529 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2530 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2531 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2532 ; AVX512-NEXT: vmovd %xmm0, %eax 2533 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2534 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2535 ; AVX512-NEXT: retq 2536 %1 = fptrunc <16 x float> %a0 to <16 x half> 2537 %2 = bitcast <16 x half> %1 to <16 x i16> 2538 ret <16 x i16> %2 2539 } 2540 2541 ; 2542 ; Float to Half (Store) 2543 ; 2544 2545 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind { 2546 ; ALL-LABEL: store_cvt_f32_to_i16: 2547 ; ALL: # %bb.0: 2548 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2549 ; ALL-NEXT: vmovd %xmm0, %eax 2550 ; ALL-NEXT: movw %ax, (%rdi) 2551 ; ALL-NEXT: retq 2552 %1 = fptrunc float %a0 to half 2553 %2 = bitcast half %1 to i16 2554 store i16 %2, i16* %a1 2555 ret void 2556 } 2557 2558 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind { 2559 ; ALL-LABEL: store_cvt_4f32_to_4i16: 2560 ; ALL: # %bb.0: 2561 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2562 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2563 ; ALL-NEXT: vmovd %xmm1, %eax 2564 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2565 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2566 ; ALL-NEXT: vmovd %xmm1, %ecx 2567 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2568 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2569 ; ALL-NEXT: vmovd %xmm1, %edx 2570 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2571 ; ALL-NEXT: vmovd %xmm0, %esi 2572 ; ALL-NEXT: movw %si, (%rdi) 2573 ; ALL-NEXT: movw %dx, 6(%rdi) 2574 ; ALL-NEXT: movw %cx, 4(%rdi) 2575 ; ALL-NEXT: movw %ax, 2(%rdi) 2576 ; ALL-NEXT: retq 2577 %1 = fptrunc <4 x float> %a0 to <4 x half> 2578 %2 = bitcast <4 x half> %1 to <4 x i16> 2579 store <4 x i16> %2, <4 x i16>* %a1 2580 ret void 2581 } 2582 2583 define void 
@store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind { 2584 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef: 2585 ; ALL: # %bb.0: 2586 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2587 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2588 ; ALL-NEXT: vmovd %xmm1, %eax 2589 ; ALL-NEXT: shll $16, %eax 2590 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2591 ; ALL-NEXT: vmovd %xmm1, %ecx 2592 ; ALL-NEXT: movzwl %cx, %ecx 2593 ; ALL-NEXT: orl %eax, %ecx 2594 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2595 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2596 ; ALL-NEXT: vmovd %xmm1, %eax 2597 ; ALL-NEXT: shll $16, %eax 2598 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2599 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2600 ; ALL-NEXT: vmovd %xmm0, %edx 2601 ; ALL-NEXT: movzwl %dx, %edx 2602 ; ALL-NEXT: orl %eax, %edx 2603 ; ALL-NEXT: shlq $32, %rdx 2604 ; ALL-NEXT: orq %rcx, %rdx 2605 ; ALL-NEXT: vmovq %rdx, %xmm0 2606 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2607 ; ALL-NEXT: vmovdqa %xmm0, (%rdi) 2608 ; ALL-NEXT: retq 2609 %1 = fptrunc <4 x float> %a0 to <4 x half> 2610 %2 = bitcast <4 x half> %1 to <4 x i16> 2611 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2612 store <8 x i16> %3, <8 x i16>* %a1 2613 ret void 2614 } 2615 2616 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { 2617 ; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: 2618 ; AVX1: # %bb.0: 2619 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2620 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2621 ; AVX1-NEXT: vmovd %xmm1, %eax 2622 ; AVX1-NEXT: shll $16, %eax 2623 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2624 ; AVX1-NEXT: vmovd %xmm1, %ecx 2625 ; AVX1-NEXT: movzwl %cx, %ecx 2626 ; AVX1-NEXT: orl %eax, %ecx 2627 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2628 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2629 ; AVX1-NEXT: vmovd %xmm1, %eax 2630 ; AVX1-NEXT: shll $16, %eax 2631 ; AVX1-NEXT: 
vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2632 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2633 ; AVX1-NEXT: vmovd %xmm0, %edx 2634 ; AVX1-NEXT: movzwl %dx, %edx 2635 ; AVX1-NEXT: orl %eax, %edx 2636 ; AVX1-NEXT: shlq $32, %rdx 2637 ; AVX1-NEXT: orq %rcx, %rdx 2638 ; AVX1-NEXT: vmovq %rdx, %xmm0 2639 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2640 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2641 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 2642 ; AVX1-NEXT: retq 2643 ; 2644 ; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero: 2645 ; AVX2-SLOW: # %bb.0: 2646 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2647 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2648 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2649 ; AVX2-SLOW-NEXT: shll $16, %eax 2650 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2651 ; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx 2652 ; AVX2-SLOW-NEXT: movzwl %cx, %ecx 2653 ; AVX2-SLOW-NEXT: orl %eax, %ecx 2654 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2655 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2656 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2657 ; AVX2-SLOW-NEXT: shll $16, %eax 2658 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2659 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2660 ; AVX2-SLOW-NEXT: vmovd %xmm0, %edx 2661 ; AVX2-SLOW-NEXT: movzwl %dx, %edx 2662 ; AVX2-SLOW-NEXT: orl %eax, %edx 2663 ; AVX2-SLOW-NEXT: shlq $32, %rdx 2664 ; AVX2-SLOW-NEXT: orq %rcx, %rdx 2665 ; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 2666 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2667 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2668 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi) 2669 ; AVX2-SLOW-NEXT: retq 2670 ; 2671 ; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero: 2672 ; AVX2-FAST: # %bb.0: 2673 ; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2674 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2675 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2676 ; AVX2-FAST-NEXT: shll $16, %eax 2677 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2678 ; 
AVX2-FAST-NEXT: vmovd %xmm1, %ecx 2679 ; AVX2-FAST-NEXT: movzwl %cx, %ecx 2680 ; AVX2-FAST-NEXT: orl %eax, %ecx 2681 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2682 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2683 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2684 ; AVX2-FAST-NEXT: shll $16, %eax 2685 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2686 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2687 ; AVX2-FAST-NEXT: vmovd %xmm0, %edx 2688 ; AVX2-FAST-NEXT: movzwl %dx, %edx 2689 ; AVX2-FAST-NEXT: orl %eax, %edx 2690 ; AVX2-FAST-NEXT: shlq $32, %rdx 2691 ; AVX2-FAST-NEXT: orq %rcx, %rdx 2692 ; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 2693 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2694 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rdi) 2695 ; AVX2-FAST-NEXT: retq 2696 ; 2697 ; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: 2698 ; AVX512F: # %bb.0: 2699 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2700 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2701 ; AVX512F-NEXT: vmovd %xmm1, %eax 2702 ; AVX512F-NEXT: shll $16, %eax 2703 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2704 ; AVX512F-NEXT: vmovd %xmm1, %ecx 2705 ; AVX512F-NEXT: movzwl %cx, %ecx 2706 ; AVX512F-NEXT: orl %eax, %ecx 2707 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2708 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2709 ; AVX512F-NEXT: vmovd %xmm1, %eax 2710 ; AVX512F-NEXT: shll $16, %eax 2711 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2712 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2713 ; AVX512F-NEXT: vmovd %xmm0, %edx 2714 ; AVX512F-NEXT: movzwl %dx, %edx 2715 ; AVX512F-NEXT: orl %eax, %edx 2716 ; AVX512F-NEXT: shlq $32, %rdx 2717 ; AVX512F-NEXT: orq %rcx, %rdx 2718 ; AVX512F-NEXT: vmovq %rdx, %xmm0 2719 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2720 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2721 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) 2722 ; AVX512F-NEXT: retq 2723 ; 2724 ; AVX512VL-LABEL: 
store_cvt_4f32_to_8i16_zero: 2725 ; AVX512VL: # %bb.0: 2726 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2727 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2728 ; AVX512VL-NEXT: vmovd %xmm1, %eax 2729 ; AVX512VL-NEXT: shll $16, %eax 2730 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2731 ; AVX512VL-NEXT: vmovd %xmm1, %ecx 2732 ; AVX512VL-NEXT: movzwl %cx, %ecx 2733 ; AVX512VL-NEXT: orl %eax, %ecx 2734 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2735 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2736 ; AVX512VL-NEXT: vmovd %xmm1, %eax 2737 ; AVX512VL-NEXT: shll $16, %eax 2738 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2739 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2740 ; AVX512VL-NEXT: vmovd %xmm0, %edx 2741 ; AVX512VL-NEXT: movzwl %dx, %edx 2742 ; AVX512VL-NEXT: orl %eax, %edx 2743 ; AVX512VL-NEXT: shlq $32, %rdx 2744 ; AVX512VL-NEXT: orq %rcx, %rdx 2745 ; AVX512VL-NEXT: vmovq %rdx, %xmm0 2746 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2747 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) 2748 ; AVX512VL-NEXT: retq 2749 %1 = fptrunc <4 x float> %a0 to <4 x half> 2750 %2 = bitcast <4 x half> %1 to <4 x i16> 2751 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2752 store <8 x i16> %3, <8 x i16>* %a1 2753 ret void 2754 } 2755 2756 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind { 2757 ; ALL-LABEL: store_cvt_8f32_to_8i16: 2758 ; ALL: # %bb.0: 2759 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2760 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2761 ; ALL-NEXT: vmovd %xmm1, %r8d 2762 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2763 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2764 ; ALL-NEXT: vmovd %xmm1, %r9d 2765 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2766 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2767 ; ALL-NEXT: vmovd %xmm1, %r10d 2768 ; ALL-NEXT: vextractf128 $1, %ymm0, 
%xmm1 2769 ; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2770 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2771 ; ALL-NEXT: vmovd %xmm2, %r11d 2772 ; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2773 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2774 ; ALL-NEXT: vmovd %xmm2, %eax 2775 ; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2776 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2777 ; ALL-NEXT: vmovd %xmm2, %ecx 2778 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2779 ; ALL-NEXT: vmovd %xmm0, %edx 2780 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2781 ; ALL-NEXT: vmovd %xmm0, %esi 2782 ; ALL-NEXT: movw %si, 8(%rdi) 2783 ; ALL-NEXT: movw %dx, (%rdi) 2784 ; ALL-NEXT: movw %cx, 14(%rdi) 2785 ; ALL-NEXT: movw %ax, 12(%rdi) 2786 ; ALL-NEXT: movw %r11w, 10(%rdi) 2787 ; ALL-NEXT: movw %r10w, 6(%rdi) 2788 ; ALL-NEXT: movw %r9w, 4(%rdi) 2789 ; ALL-NEXT: movw %r8w, 2(%rdi) 2790 ; ALL-NEXT: vzeroupper 2791 ; ALL-NEXT: retq 2792 %1 = fptrunc <8 x float> %a0 to <8 x half> 2793 %2 = bitcast <8 x half> %1 to <8 x i16> 2794 store <8 x i16> %2, <8 x i16>* %a1 2795 ret void 2796 } 2797 2798 define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind { 2799 ; AVX1-LABEL: store_cvt_16f32_to_16i16: 2800 ; AVX1: # %bb.0: 2801 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2802 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2803 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2804 ; AVX1-NEXT: vmovd %xmm4, %eax 2805 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2806 ; AVX1-NEXT: movw %ax, 24(%rdi) 2807 ; AVX1-NEXT: vmovd %xmm4, %eax 2808 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2809 ; AVX1-NEXT: movw %ax, 16(%rdi) 2810 ; AVX1-NEXT: vmovd %xmm4, %eax 2811 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2812 ; AVX1-NEXT: movw %ax, 8(%rdi) 2813 ; AVX1-NEXT: vmovd %xmm4, %eax 2814 ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2815 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2816 ; AVX1-NEXT: movw %ax, (%rdi) 2817 ; AVX1-NEXT: vmovd %xmm4, %eax 2818 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2819 ; AVX1-NEXT: 
vcvtps2ph $4, %xmm4, %xmm4 2820 ; AVX1-NEXT: movw %ax, 30(%rdi) 2821 ; AVX1-NEXT: vmovd %xmm4, %eax 2822 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2823 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2824 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2825 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2826 ; AVX1-NEXT: movw %ax, 28(%rdi) 2827 ; AVX1-NEXT: vmovd %xmm3, %eax 2828 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2829 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2830 ; AVX1-NEXT: movw %ax, 26(%rdi) 2831 ; AVX1-NEXT: vmovd %xmm3, %eax 2832 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2833 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2834 ; AVX1-NEXT: movw %ax, 22(%rdi) 2835 ; AVX1-NEXT: vmovd %xmm3, %eax 2836 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2837 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2838 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2839 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2840 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2841 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2842 ; AVX1-NEXT: movw %ax, 20(%rdi) 2843 ; AVX1-NEXT: vmovd %xmm1, %eax 2844 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2845 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2846 ; AVX1-NEXT: movw %ax, 18(%rdi) 2847 ; AVX1-NEXT: vmovd %xmm1, %eax 2848 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2849 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2850 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2851 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2852 ; AVX1-NEXT: movw %ax, 14(%rdi) 2853 ; AVX1-NEXT: vmovd %xmm2, %eax 2854 ; AVX1-NEXT: movw %ax, 12(%rdi) 2855 ; AVX1-NEXT: vmovd %xmm1, %eax 2856 ; AVX1-NEXT: movw %ax, 10(%rdi) 2857 ; AVX1-NEXT: vmovd %xmm0, %eax 2858 ; AVX1-NEXT: movw %ax, 6(%rdi) 2859 ; AVX1-NEXT: vmovd %xmm3, %eax 2860 ; AVX1-NEXT: movw %ax, 4(%rdi) 2861 ; AVX1-NEXT: vmovd %xmm4, %eax 2862 ; AVX1-NEXT: movw %ax, 2(%rdi) 2863 ; AVX1-NEXT: vzeroupper 2864 ; AVX1-NEXT: retq 2865 ; 2866 ; AVX2-LABEL: store_cvt_16f32_to_16i16: 2867 ; AVX2: 
# %bb.0: 2868 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 2869 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 2870 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2871 ; AVX2-NEXT: vmovd %xmm4, %eax 2872 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2873 ; AVX2-NEXT: movw %ax, 24(%rdi) 2874 ; AVX2-NEXT: vmovd %xmm4, %eax 2875 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2876 ; AVX2-NEXT: movw %ax, 16(%rdi) 2877 ; AVX2-NEXT: vmovd %xmm4, %eax 2878 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2879 ; AVX2-NEXT: movw %ax, 8(%rdi) 2880 ; AVX2-NEXT: vmovd %xmm4, %eax 2881 ; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2882 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2883 ; AVX2-NEXT: movw %ax, (%rdi) 2884 ; AVX2-NEXT: vmovd %xmm4, %eax 2885 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2886 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2887 ; AVX2-NEXT: movw %ax, 30(%rdi) 2888 ; AVX2-NEXT: vmovd %xmm4, %eax 2889 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2890 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2891 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2892 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2893 ; AVX2-NEXT: movw %ax, 28(%rdi) 2894 ; AVX2-NEXT: vmovd %xmm3, %eax 2895 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2896 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2897 ; AVX2-NEXT: movw %ax, 26(%rdi) 2898 ; AVX2-NEXT: vmovd %xmm3, %eax 2899 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2900 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2901 ; AVX2-NEXT: movw %ax, 22(%rdi) 2902 ; AVX2-NEXT: vmovd %xmm3, %eax 2903 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2904 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2905 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2906 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2907 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2908 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2909 ; AVX2-NEXT: movw %ax, 20(%rdi) 2910 ; AVX2-NEXT: vmovd %xmm1, %eax 2911 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2912 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2913 ; 
AVX2-NEXT: movw %ax, 18(%rdi) 2914 ; AVX2-NEXT: vmovd %xmm1, %eax 2915 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2916 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2917 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2918 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2919 ; AVX2-NEXT: movw %ax, 14(%rdi) 2920 ; AVX2-NEXT: vmovd %xmm2, %eax 2921 ; AVX2-NEXT: movw %ax, 12(%rdi) 2922 ; AVX2-NEXT: vmovd %xmm1, %eax 2923 ; AVX2-NEXT: movw %ax, 10(%rdi) 2924 ; AVX2-NEXT: vmovd %xmm0, %eax 2925 ; AVX2-NEXT: movw %ax, 6(%rdi) 2926 ; AVX2-NEXT: vmovd %xmm3, %eax 2927 ; AVX2-NEXT: movw %ax, 4(%rdi) 2928 ; AVX2-NEXT: vmovd %xmm4, %eax 2929 ; AVX2-NEXT: movw %ax, 2(%rdi) 2930 ; AVX2-NEXT: vzeroupper 2931 ; AVX2-NEXT: retq 2932 ; 2933 ; AVX512-LABEL: store_cvt_16f32_to_16i16: 2934 ; AVX512: # %bb.0: 2935 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2936 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 2937 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 2938 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2939 ; AVX512-NEXT: vmovd %xmm4, %eax 2940 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2941 ; AVX512-NEXT: movw %ax, 24(%rdi) 2942 ; AVX512-NEXT: vmovd %xmm4, %eax 2943 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2944 ; AVX512-NEXT: movw %ax, 16(%rdi) 2945 ; AVX512-NEXT: vmovd %xmm4, %eax 2946 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2947 ; AVX512-NEXT: movw %ax, 8(%rdi) 2948 ; AVX512-NEXT: vmovd %xmm4, %eax 2949 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2950 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2951 ; AVX512-NEXT: movw %ax, (%rdi) 2952 ; AVX512-NEXT: vmovd %xmm4, %eax 2953 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2954 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2955 ; AVX512-NEXT: movw %ax, 30(%rdi) 2956 ; AVX512-NEXT: vmovd %xmm4, %eax 2957 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2958 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2959 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2960 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2961 ; AVX512-NEXT: 
movw %ax, 28(%rdi) 2962 ; AVX512-NEXT: vmovd %xmm3, %eax 2963 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] 2964 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2965 ; AVX512-NEXT: movw %ax, 26(%rdi) 2966 ; AVX512-NEXT: vmovd %xmm3, %eax 2967 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 2968 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2969 ; AVX512-NEXT: movw %ax, 22(%rdi) 2970 ; AVX512-NEXT: vmovd %xmm3, %eax 2971 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2972 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2973 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2974 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2975 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] 2976 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2977 ; AVX512-NEXT: movw %ax, 20(%rdi) 2978 ; AVX512-NEXT: vmovd %xmm2, %eax 2979 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2980 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2981 ; AVX512-NEXT: movw %ax, 18(%rdi) 2982 ; AVX512-NEXT: vmovd %xmm2, %eax 2983 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2984 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2985 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2986 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2987 ; AVX512-NEXT: movw %ax, 14(%rdi) 2988 ; AVX512-NEXT: vmovd %xmm1, %eax 2989 ; AVX512-NEXT: movw %ax, 12(%rdi) 2990 ; AVX512-NEXT: vmovd %xmm2, %eax 2991 ; AVX512-NEXT: movw %ax, 10(%rdi) 2992 ; AVX512-NEXT: vmovd %xmm0, %eax 2993 ; AVX512-NEXT: movw %ax, 6(%rdi) 2994 ; AVX512-NEXT: vmovd %xmm3, %eax 2995 ; AVX512-NEXT: movw %ax, 4(%rdi) 2996 ; AVX512-NEXT: vmovd %xmm4, %eax 2997 ; AVX512-NEXT: movw %ax, 2(%rdi) 2998 ; AVX512-NEXT: vzeroupper 2999 ; AVX512-NEXT: retq 3000 %1 = fptrunc <16 x float> %a0 to <16 x half> 3001 %2 = bitcast <16 x half> %1 to <16 x i16> 3002 store <16 x i16> %2, <16 x i16>* %a1 3003 ret void 3004 } 3005 3006 ; 3007 ; Double to Half 3008 ; 3009 3010 define i16 @cvt_f64_to_i16(double %a0) nounwind { 3011 ; ALL-LABEL: cvt_f64_to_i16: 3012 ; ALL: # 
%bb.0: 3013 ; ALL-NEXT: jmp __truncdfhf2 # TAILCALL 3014 %1 = fptrunc double %a0 to half 3015 %2 = bitcast half %1 to i16 3016 ret i16 %2 3017 } 3018 3019 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { 3020 ; ALL-LABEL: cvt_2f64_to_2i16: 3021 ; ALL: # %bb.0: 3022 ; ALL-NEXT: pushq %rbx 3023 ; ALL-NEXT: subq $16, %rsp 3024 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3025 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3026 ; ALL-NEXT: callq __truncdfhf2 3027 ; ALL-NEXT: movl %eax, %ebx 3028 ; ALL-NEXT: shll $16, %ebx 3029 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3030 ; ALL-NEXT: callq __truncdfhf2 3031 ; ALL-NEXT: movzwl %ax, %eax 3032 ; ALL-NEXT: orl %ebx, %eax 3033 ; ALL-NEXT: vmovd %eax, %xmm0 3034 ; ALL-NEXT: addq $16, %rsp 3035 ; ALL-NEXT: popq %rbx 3036 ; ALL-NEXT: retq 3037 %1 = fptrunc <2 x double> %a0 to <2 x half> 3038 %2 = bitcast <2 x half> %1 to <2 x i16> 3039 ret <2 x i16> %2 3040 } 3041 3042 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { 3043 ; AVX1-LABEL: cvt_4f64_to_4i16: 3044 ; AVX1: # %bb.0: 3045 ; AVX1-NEXT: pushq %r14 3046 ; AVX1-NEXT: pushq %rbx 3047 ; AVX1-NEXT: subq $40, %rsp 3048 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3049 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3050 ; AVX1-NEXT: vzeroupper 3051 ; AVX1-NEXT: callq __truncdfhf2 3052 ; AVX1-NEXT: movl %eax, %ebx 3053 ; AVX1-NEXT: shll $16, %ebx 3054 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3055 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3056 ; AVX1-NEXT: vzeroupper 3057 ; AVX1-NEXT: callq __truncdfhf2 3058 ; AVX1-NEXT: movzwl %ax, %r14d 3059 ; AVX1-NEXT: orl %ebx, %r14d 3060 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3061 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3062 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3063 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3064 ; AVX1-NEXT: vzeroupper 3065 ; AVX1-NEXT: callq __truncdfhf2 3066 ; AVX1-NEXT: movl %eax, %ebx 3067 ; AVX1-NEXT: 
shll $16, %ebx 3068 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3069 ; AVX1-NEXT: callq __truncdfhf2 3070 ; AVX1-NEXT: movzwl %ax, %eax 3071 ; AVX1-NEXT: orl %ebx, %eax 3072 ; AVX1-NEXT: shlq $32, %rax 3073 ; AVX1-NEXT: orq %r14, %rax 3074 ; AVX1-NEXT: vmovq %rax, %xmm0 3075 ; AVX1-NEXT: addq $40, %rsp 3076 ; AVX1-NEXT: popq %rbx 3077 ; AVX1-NEXT: popq %r14 3078 ; AVX1-NEXT: retq 3079 ; 3080 ; AVX2-LABEL: cvt_4f64_to_4i16: 3081 ; AVX2: # %bb.0: 3082 ; AVX2-NEXT: pushq %r14 3083 ; AVX2-NEXT: pushq %rbx 3084 ; AVX2-NEXT: subq $40, %rsp 3085 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3086 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3087 ; AVX2-NEXT: vzeroupper 3088 ; AVX2-NEXT: callq __truncdfhf2 3089 ; AVX2-NEXT: movl %eax, %ebx 3090 ; AVX2-NEXT: shll $16, %ebx 3091 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3092 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3093 ; AVX2-NEXT: vzeroupper 3094 ; AVX2-NEXT: callq __truncdfhf2 3095 ; AVX2-NEXT: movzwl %ax, %r14d 3096 ; AVX2-NEXT: orl %ebx, %r14d 3097 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3098 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3099 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3100 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3101 ; AVX2-NEXT: vzeroupper 3102 ; AVX2-NEXT: callq __truncdfhf2 3103 ; AVX2-NEXT: movl %eax, %ebx 3104 ; AVX2-NEXT: shll $16, %ebx 3105 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3106 ; AVX2-NEXT: callq __truncdfhf2 3107 ; AVX2-NEXT: movzwl %ax, %eax 3108 ; AVX2-NEXT: orl %ebx, %eax 3109 ; AVX2-NEXT: shlq $32, %rax 3110 ; AVX2-NEXT: orq %r14, %rax 3111 ; AVX2-NEXT: vmovq %rax, %xmm0 3112 ; AVX2-NEXT: addq $40, %rsp 3113 ; AVX2-NEXT: popq %rbx 3114 ; AVX2-NEXT: popq %r14 3115 ; AVX2-NEXT: retq 3116 ; 3117 ; AVX512-LABEL: cvt_4f64_to_4i16: 3118 ; AVX512: # %bb.0: 3119 ; AVX512-NEXT: pushq %r14 3120 ; AVX512-NEXT: pushq %rbx 3121 ; AVX512-NEXT: subq $40, %rsp 3122 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3123 
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3124 ; AVX512-NEXT: vzeroupper 3125 ; AVX512-NEXT: callq __truncdfhf2 3126 ; AVX512-NEXT: movl %eax, %ebx 3127 ; AVX512-NEXT: shll $16, %ebx 3128 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3129 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3130 ; AVX512-NEXT: vzeroupper 3131 ; AVX512-NEXT: callq __truncdfhf2 3132 ; AVX512-NEXT: movzwl %ax, %r14d 3133 ; AVX512-NEXT: orl %ebx, %r14d 3134 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3135 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3136 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3137 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3138 ; AVX512-NEXT: vzeroupper 3139 ; AVX512-NEXT: callq __truncdfhf2 3140 ; AVX512-NEXT: movl %eax, %ebx 3141 ; AVX512-NEXT: shll $16, %ebx 3142 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3143 ; AVX512-NEXT: callq __truncdfhf2 3144 ; AVX512-NEXT: movzwl %ax, %eax 3145 ; AVX512-NEXT: orl %ebx, %eax 3146 ; AVX512-NEXT: shlq $32, %rax 3147 ; AVX512-NEXT: orq %r14, %rax 3148 ; AVX512-NEXT: vmovq %rax, %xmm0 3149 ; AVX512-NEXT: addq $40, %rsp 3150 ; AVX512-NEXT: popq %rbx 3151 ; AVX512-NEXT: popq %r14 3152 ; AVX512-NEXT: retq 3153 %1 = fptrunc <4 x double> %a0 to <4 x half> 3154 %2 = bitcast <4 x half> %1 to <4 x i16> 3155 ret <4 x i16> %2 3156 } 3157 3158 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { 3159 ; AVX1-LABEL: cvt_4f64_to_8i16_undef: 3160 ; AVX1: # %bb.0: 3161 ; AVX1-NEXT: pushq %r14 3162 ; AVX1-NEXT: pushq %rbx 3163 ; AVX1-NEXT: subq $40, %rsp 3164 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3165 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3166 ; AVX1-NEXT: vzeroupper 3167 ; AVX1-NEXT: callq __truncdfhf2 3168 ; AVX1-NEXT: movl %eax, %ebx 3169 ; AVX1-NEXT: shll $16, %ebx 3170 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3171 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3172 ; AVX1-NEXT: vzeroupper 3173 ; AVX1-NEXT: callq 
__truncdfhf2 3174 ; AVX1-NEXT: movzwl %ax, %r14d 3175 ; AVX1-NEXT: orl %ebx, %r14d 3176 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3177 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3178 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3179 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3180 ; AVX1-NEXT: vzeroupper 3181 ; AVX1-NEXT: callq __truncdfhf2 3182 ; AVX1-NEXT: movl %eax, %ebx 3183 ; AVX1-NEXT: shll $16, %ebx 3184 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3185 ; AVX1-NEXT: callq __truncdfhf2 3186 ; AVX1-NEXT: movzwl %ax, %eax 3187 ; AVX1-NEXT: orl %ebx, %eax 3188 ; AVX1-NEXT: shlq $32, %rax 3189 ; AVX1-NEXT: orq %r14, %rax 3190 ; AVX1-NEXT: vmovq %rax, %xmm0 3191 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3192 ; AVX1-NEXT: addq $40, %rsp 3193 ; AVX1-NEXT: popq %rbx 3194 ; AVX1-NEXT: popq %r14 3195 ; AVX1-NEXT: retq 3196 ; 3197 ; AVX2-LABEL: cvt_4f64_to_8i16_undef: 3198 ; AVX2: # %bb.0: 3199 ; AVX2-NEXT: pushq %r14 3200 ; AVX2-NEXT: pushq %rbx 3201 ; AVX2-NEXT: subq $40, %rsp 3202 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3203 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3204 ; AVX2-NEXT: vzeroupper 3205 ; AVX2-NEXT: callq __truncdfhf2 3206 ; AVX2-NEXT: movl %eax, %ebx 3207 ; AVX2-NEXT: shll $16, %ebx 3208 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3209 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3210 ; AVX2-NEXT: vzeroupper 3211 ; AVX2-NEXT: callq __truncdfhf2 3212 ; AVX2-NEXT: movzwl %ax, %r14d 3213 ; AVX2-NEXT: orl %ebx, %r14d 3214 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3215 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3216 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3217 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3218 ; AVX2-NEXT: vzeroupper 3219 ; AVX2-NEXT: callq __truncdfhf2 3220 ; AVX2-NEXT: movl %eax, %ebx 3221 ; AVX2-NEXT: shll $16, %ebx 3222 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3223 ; AVX2-NEXT: callq __truncdfhf2 3224 ; AVX2-NEXT: movzwl 
%ax, %eax 3225 ; AVX2-NEXT: orl %ebx, %eax 3226 ; AVX2-NEXT: shlq $32, %rax 3227 ; AVX2-NEXT: orq %r14, %rax 3228 ; AVX2-NEXT: vmovq %rax, %xmm0 3229 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3230 ; AVX2-NEXT: addq $40, %rsp 3231 ; AVX2-NEXT: popq %rbx 3232 ; AVX2-NEXT: popq %r14 3233 ; AVX2-NEXT: retq 3234 ; 3235 ; AVX512-LABEL: cvt_4f64_to_8i16_undef: 3236 ; AVX512: # %bb.0: 3237 ; AVX512-NEXT: pushq %r14 3238 ; AVX512-NEXT: pushq %rbx 3239 ; AVX512-NEXT: subq $40, %rsp 3240 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3241 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3242 ; AVX512-NEXT: vzeroupper 3243 ; AVX512-NEXT: callq __truncdfhf2 3244 ; AVX512-NEXT: movl %eax, %ebx 3245 ; AVX512-NEXT: shll $16, %ebx 3246 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3247 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3248 ; AVX512-NEXT: vzeroupper 3249 ; AVX512-NEXT: callq __truncdfhf2 3250 ; AVX512-NEXT: movzwl %ax, %r14d 3251 ; AVX512-NEXT: orl %ebx, %r14d 3252 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3253 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3254 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3255 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3256 ; AVX512-NEXT: vzeroupper 3257 ; AVX512-NEXT: callq __truncdfhf2 3258 ; AVX512-NEXT: movl %eax, %ebx 3259 ; AVX512-NEXT: shll $16, %ebx 3260 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3261 ; AVX512-NEXT: callq __truncdfhf2 3262 ; AVX512-NEXT: movzwl %ax, %eax 3263 ; AVX512-NEXT: orl %ebx, %eax 3264 ; AVX512-NEXT: shlq $32, %rax 3265 ; AVX512-NEXT: orq %r14, %rax 3266 ; AVX512-NEXT: vmovq %rax, %xmm0 3267 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3268 ; AVX512-NEXT: addq $40, %rsp 3269 ; AVX512-NEXT: popq %rbx 3270 ; AVX512-NEXT: popq %r14 3271 ; AVX512-NEXT: retq 3272 %1 = fptrunc <4 x double> %a0 to <4 x half> 3273 %2 = bitcast <4 x half> %1 to <4 x i16> 3274 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3275 ret <8 x i16> %3 3276 } 3277 3278 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { 3279 ; AVX1-LABEL: cvt_4f64_to_8i16_zero: 3280 ; AVX1: # %bb.0: 3281 ; AVX1-NEXT: pushq %r14 3282 ; AVX1-NEXT: pushq %rbx 3283 ; AVX1-NEXT: subq $40, %rsp 3284 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3285 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3286 ; AVX1-NEXT: vzeroupper 3287 ; AVX1-NEXT: callq __truncdfhf2 3288 ; AVX1-NEXT: movl %eax, %ebx 3289 ; AVX1-NEXT: shll $16, %ebx 3290 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3291 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3292 ; AVX1-NEXT: vzeroupper 3293 ; AVX1-NEXT: callq __truncdfhf2 3294 ; AVX1-NEXT: movzwl %ax, %r14d 3295 ; AVX1-NEXT: orl %ebx, %r14d 3296 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3297 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3298 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3299 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3300 ; AVX1-NEXT: vzeroupper 3301 ; AVX1-NEXT: callq __truncdfhf2 3302 ; AVX1-NEXT: movl %eax, %ebx 3303 ; AVX1-NEXT: shll $16, %ebx 3304 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3305 ; AVX1-NEXT: callq __truncdfhf2 3306 ; AVX1-NEXT: movzwl %ax, %eax 3307 ; AVX1-NEXT: orl %ebx, %eax 3308 ; AVX1-NEXT: shlq $32, %rax 3309 ; AVX1-NEXT: orq %r14, %rax 3310 ; AVX1-NEXT: vmovq %rax, %xmm0 3311 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3312 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3313 ; AVX1-NEXT: addq $40, %rsp 3314 ; AVX1-NEXT: popq %rbx 3315 ; AVX1-NEXT: popq %r14 3316 ; AVX1-NEXT: retq 3317 ; 3318 ; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero: 3319 ; AVX2-SLOW: # %bb.0: 3320 ; AVX2-SLOW-NEXT: pushq %r14 3321 ; AVX2-SLOW-NEXT: pushq %rbx 3322 ; AVX2-SLOW-NEXT: subq $40, %rsp 3323 ; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3324 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3325 ; AVX2-SLOW-NEXT: 
vzeroupper 3326 ; AVX2-SLOW-NEXT: callq __truncdfhf2 3327 ; AVX2-SLOW-NEXT: movl %eax, %ebx 3328 ; AVX2-SLOW-NEXT: shll $16, %ebx 3329 ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3330 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3331 ; AVX2-SLOW-NEXT: vzeroupper 3332 ; AVX2-SLOW-NEXT: callq __truncdfhf2 3333 ; AVX2-SLOW-NEXT: movzwl %ax, %r14d 3334 ; AVX2-SLOW-NEXT: orl %ebx, %r14d 3335 ; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3336 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 3337 ; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3338 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3339 ; AVX2-SLOW-NEXT: vzeroupper 3340 ; AVX2-SLOW-NEXT: callq __truncdfhf2 3341 ; AVX2-SLOW-NEXT: movl %eax, %ebx 3342 ; AVX2-SLOW-NEXT: shll $16, %ebx 3343 ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3344 ; AVX2-SLOW-NEXT: callq __truncdfhf2 3345 ; AVX2-SLOW-NEXT: movzwl %ax, %eax 3346 ; AVX2-SLOW-NEXT: orl %ebx, %eax 3347 ; AVX2-SLOW-NEXT: shlq $32, %rax 3348 ; AVX2-SLOW-NEXT: orq %r14, %rax 3349 ; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 3350 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3351 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3352 ; AVX2-SLOW-NEXT: addq $40, %rsp 3353 ; AVX2-SLOW-NEXT: popq %rbx 3354 ; AVX2-SLOW-NEXT: popq %r14 3355 ; AVX2-SLOW-NEXT: retq 3356 ; 3357 ; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero: 3358 ; AVX2-FAST: # %bb.0: 3359 ; AVX2-FAST-NEXT: pushq %r14 3360 ; AVX2-FAST-NEXT: pushq %rbx 3361 ; AVX2-FAST-NEXT: subq $40, %rsp 3362 ; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3363 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3364 ; AVX2-FAST-NEXT: vzeroupper 3365 ; AVX2-FAST-NEXT: callq __truncdfhf2 3366 ; AVX2-FAST-NEXT: movl %eax, %ebx 3367 ; AVX2-FAST-NEXT: shll $16, %ebx 3368 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3369 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3370 ; AVX2-FAST-NEXT: vzeroupper 3371 ; 
AVX2-FAST-NEXT: callq __truncdfhf2 3372 ; AVX2-FAST-NEXT: movzwl %ax, %r14d 3373 ; AVX2-FAST-NEXT: orl %ebx, %r14d 3374 ; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3375 ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 3376 ; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3377 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3378 ; AVX2-FAST-NEXT: vzeroupper 3379 ; AVX2-FAST-NEXT: callq __truncdfhf2 3380 ; AVX2-FAST-NEXT: movl %eax, %ebx 3381 ; AVX2-FAST-NEXT: shll $16, %ebx 3382 ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3383 ; AVX2-FAST-NEXT: callq __truncdfhf2 3384 ; AVX2-FAST-NEXT: movzwl %ax, %eax 3385 ; AVX2-FAST-NEXT: orl %ebx, %eax 3386 ; AVX2-FAST-NEXT: shlq $32, %rax 3387 ; AVX2-FAST-NEXT: orq %r14, %rax 3388 ; AVX2-FAST-NEXT: vmovq %rax, %xmm0 3389 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 3390 ; AVX2-FAST-NEXT: addq $40, %rsp 3391 ; AVX2-FAST-NEXT: popq %rbx 3392 ; AVX2-FAST-NEXT: popq %r14 3393 ; AVX2-FAST-NEXT: retq 3394 ; 3395 ; AVX512F-LABEL: cvt_4f64_to_8i16_zero: 3396 ; AVX512F: # %bb.0: 3397 ; AVX512F-NEXT: pushq %r14 3398 ; AVX512F-NEXT: pushq %rbx 3399 ; AVX512F-NEXT: subq $40, %rsp 3400 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3401 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3402 ; AVX512F-NEXT: vzeroupper 3403 ; AVX512F-NEXT: callq __truncdfhf2 3404 ; AVX512F-NEXT: movl %eax, %ebx 3405 ; AVX512F-NEXT: shll $16, %ebx 3406 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3407 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3408 ; AVX512F-NEXT: vzeroupper 3409 ; AVX512F-NEXT: callq __truncdfhf2 3410 ; AVX512F-NEXT: movzwl %ax, %r14d 3411 ; AVX512F-NEXT: orl %ebx, %r14d 3412 ; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3413 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 3414 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3415 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3416 ; 
AVX512F-NEXT: vzeroupper 3417 ; AVX512F-NEXT: callq __truncdfhf2 3418 ; AVX512F-NEXT: movl %eax, %ebx 3419 ; AVX512F-NEXT: shll $16, %ebx 3420 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3421 ; AVX512F-NEXT: callq __truncdfhf2 3422 ; AVX512F-NEXT: movzwl %ax, %eax 3423 ; AVX512F-NEXT: orl %ebx, %eax 3424 ; AVX512F-NEXT: shlq $32, %rax 3425 ; AVX512F-NEXT: orq %r14, %rax 3426 ; AVX512F-NEXT: vmovq %rax, %xmm0 3427 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3428 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3429 ; AVX512F-NEXT: addq $40, %rsp 3430 ; AVX512F-NEXT: popq %rbx 3431 ; AVX512F-NEXT: popq %r14 3432 ; AVX512F-NEXT: retq 3433 ; 3434 ; AVX512VL-LABEL: cvt_4f64_to_8i16_zero: 3435 ; AVX512VL: # %bb.0: 3436 ; AVX512VL-NEXT: pushq %r14 3437 ; AVX512VL-NEXT: pushq %rbx 3438 ; AVX512VL-NEXT: subq $40, %rsp 3439 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3440 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3441 ; AVX512VL-NEXT: vzeroupper 3442 ; AVX512VL-NEXT: callq __truncdfhf2 3443 ; AVX512VL-NEXT: movl %eax, %ebx 3444 ; AVX512VL-NEXT: shll $16, %ebx 3445 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3446 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3447 ; AVX512VL-NEXT: vzeroupper 3448 ; AVX512VL-NEXT: callq __truncdfhf2 3449 ; AVX512VL-NEXT: movzwl %ax, %r14d 3450 ; AVX512VL-NEXT: orl %ebx, %r14d 3451 ; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3452 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 3453 ; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3454 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3455 ; AVX512VL-NEXT: vzeroupper 3456 ; AVX512VL-NEXT: callq __truncdfhf2 3457 ; AVX512VL-NEXT: movl %eax, %ebx 3458 ; AVX512VL-NEXT: shll $16, %ebx 3459 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3460 ; AVX512VL-NEXT: callq __truncdfhf2 3461 ; AVX512VL-NEXT: movzwl %ax, %eax 3462 ; AVX512VL-NEXT: orl %ebx, %eax 3463 ; AVX512VL-NEXT: shlq $32, %rax 3464 
; AVX512VL-NEXT: orq %r14, %rax 3465 ; AVX512VL-NEXT: vmovq %rax, %xmm0 3466 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 3467 ; AVX512VL-NEXT: addq $40, %rsp 3468 ; AVX512VL-NEXT: popq %rbx 3469 ; AVX512VL-NEXT: popq %r14 3470 ; AVX512VL-NEXT: retq 3471 %1 = fptrunc <4 x double> %a0 to <4 x half> 3472 %2 = bitcast <4 x half> %1 to <4 x i16> 3473 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3474 ret <8 x i16> %3 3475 } 3476 3477 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { 3478 ; AVX1-LABEL: cvt_8f64_to_8i16: 3479 ; AVX1: # %bb.0: 3480 ; AVX1-NEXT: pushq %r15 3481 ; AVX1-NEXT: pushq %r14 3482 ; AVX1-NEXT: pushq %rbx 3483 ; AVX1-NEXT: subq $64, %rsp 3484 ; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 3485 ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3486 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3487 ; AVX1-NEXT: vzeroupper 3488 ; AVX1-NEXT: callq __truncdfhf2 3489 ; AVX1-NEXT: movl %eax, %ebx 3490 ; AVX1-NEXT: shll $16, %ebx 3491 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3492 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3493 ; AVX1-NEXT: vzeroupper 3494 ; AVX1-NEXT: callq __truncdfhf2 3495 ; AVX1-NEXT: movzwl %ax, %r15d 3496 ; AVX1-NEXT: orl %ebx, %r15d 3497 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3498 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3499 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3500 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3501 ; AVX1-NEXT: vzeroupper 3502 ; AVX1-NEXT: callq __truncdfhf2 3503 ; AVX1-NEXT: movl %eax, %ebx 3504 ; AVX1-NEXT: shll $16, %ebx 3505 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3506 ; AVX1-NEXT: callq __truncdfhf2 3507 ; AVX1-NEXT: movzwl %ax, %r14d 3508 ; AVX1-NEXT: orl %ebx, %r14d 3509 ; AVX1-NEXT: shlq $32, 
%r14 3510 ; AVX1-NEXT: orq %r15, %r14 3511 ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3512 ; AVX1-NEXT: # xmm0 = mem[1,0] 3513 ; AVX1-NEXT: callq __truncdfhf2 3514 ; AVX1-NEXT: movl %eax, %ebx 3515 ; AVX1-NEXT: shll $16, %ebx 3516 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3517 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3518 ; AVX1-NEXT: vzeroupper 3519 ; AVX1-NEXT: callq __truncdfhf2 3520 ; AVX1-NEXT: movzwl %ax, %r15d 3521 ; AVX1-NEXT: orl %ebx, %r15d 3522 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3523 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3524 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3525 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3526 ; AVX1-NEXT: vzeroupper 3527 ; AVX1-NEXT: callq __truncdfhf2 3528 ; AVX1-NEXT: movl %eax, %ebx 3529 ; AVX1-NEXT: shll $16, %ebx 3530 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3531 ; AVX1-NEXT: callq __truncdfhf2 3532 ; AVX1-NEXT: movzwl %ax, %eax 3533 ; AVX1-NEXT: orl %ebx, %eax 3534 ; AVX1-NEXT: shlq $32, %rax 3535 ; AVX1-NEXT: orq %r15, %rax 3536 ; AVX1-NEXT: vmovq %rax, %xmm0 3537 ; AVX1-NEXT: vmovq %r14, %xmm1 3538 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3539 ; AVX1-NEXT: addq $64, %rsp 3540 ; AVX1-NEXT: popq %rbx 3541 ; AVX1-NEXT: popq %r14 3542 ; AVX1-NEXT: popq %r15 3543 ; AVX1-NEXT: retq 3544 ; 3545 ; AVX2-LABEL: cvt_8f64_to_8i16: 3546 ; AVX2: # %bb.0: 3547 ; AVX2-NEXT: pushq %r15 3548 ; AVX2-NEXT: pushq %r14 3549 ; AVX2-NEXT: pushq %rbx 3550 ; AVX2-NEXT: subq $64, %rsp 3551 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 3552 ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3553 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3554 ; AVX2-NEXT: vzeroupper 3555 ; AVX2-NEXT: callq __truncdfhf2 3556 ; AVX2-NEXT: movl %eax, %ebx 3557 ; AVX2-NEXT: shll $16, %ebx 3558 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3559 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3560 ; 
AVX2-NEXT: vzeroupper 3561 ; AVX2-NEXT: callq __truncdfhf2 3562 ; AVX2-NEXT: movzwl %ax, %r15d 3563 ; AVX2-NEXT: orl %ebx, %r15d 3564 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3565 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3566 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3567 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3568 ; AVX2-NEXT: vzeroupper 3569 ; AVX2-NEXT: callq __truncdfhf2 3570 ; AVX2-NEXT: movl %eax, %ebx 3571 ; AVX2-NEXT: shll $16, %ebx 3572 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3573 ; AVX2-NEXT: callq __truncdfhf2 3574 ; AVX2-NEXT: movzwl %ax, %r14d 3575 ; AVX2-NEXT: orl %ebx, %r14d 3576 ; AVX2-NEXT: shlq $32, %r14 3577 ; AVX2-NEXT: orq %r15, %r14 3578 ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3579 ; AVX2-NEXT: # xmm0 = mem[1,0] 3580 ; AVX2-NEXT: callq __truncdfhf2 3581 ; AVX2-NEXT: movl %eax, %ebx 3582 ; AVX2-NEXT: shll $16, %ebx 3583 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3584 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3585 ; AVX2-NEXT: vzeroupper 3586 ; AVX2-NEXT: callq __truncdfhf2 3587 ; AVX2-NEXT: movzwl %ax, %r15d 3588 ; AVX2-NEXT: orl %ebx, %r15d 3589 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3590 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3591 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3592 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3593 ; AVX2-NEXT: vzeroupper 3594 ; AVX2-NEXT: callq __truncdfhf2 3595 ; AVX2-NEXT: movl %eax, %ebx 3596 ; AVX2-NEXT: shll $16, %ebx 3597 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3598 ; AVX2-NEXT: callq __truncdfhf2 3599 ; AVX2-NEXT: movzwl %ax, %eax 3600 ; AVX2-NEXT: orl %ebx, %eax 3601 ; AVX2-NEXT: shlq $32, %rax 3602 ; AVX2-NEXT: orq %r15, %rax 3603 ; AVX2-NEXT: vmovq %rax, %xmm0 3604 ; AVX2-NEXT: vmovq %r14, %xmm1 3605 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3606 ; AVX2-NEXT: addq $64, %rsp 3607 ; AVX2-NEXT: popq %rbx 
; AVX2-NEXT:    popq %r14
; AVX2-NEXT:    popq %r15
; AVX2-NEXT:    retq
;
; NOTE(review): CHECK lines below are autogenerated by update_llc_test_checks.py;
; do not hand-edit the assertions — regenerate instead. This span restores the
; original one-directive-per-line layout (extraction had fused line numbers in).
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $96, %rsp
; AVX512-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebx
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movzwl %ax, %r15d
; AVX512-NEXT:    orl %ebx, %r15d
; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebx
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movzwl %ax, %r14d
; AVX512-NEXT:    orl %ebx, %r14d
; AVX512-NEXT:    shlq $32, %r14
; AVX512-NEXT:    orq %r15, %r14
; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebx
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movzwl %ax, %r15d
; AVX512-NEXT:    orl %ebx, %r15d
; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebx
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    orl %ebx, %eax
; AVX512-NEXT:    shlq $32, %rax
; AVX512-NEXT:    orq %r15, %rax
; AVX512-NEXT:    vmovq %rax, %xmm0
; AVX512-NEXT:    vmovq %r14, %xmm1
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT:    addq $96, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}

;
; Double to Half (Store)
;

; Truncate a scalar f64 to half (via __truncdfhf2 libcall) and store the
; raw i16 bits. CHECK lines are autogenerated — regenerate, don't hand-edit.
define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
; ALL-LABEL: store_cvt_f64_to_i16:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbx
; ALL-NEXT:    movq %rdi, %rbx
; ALL-NEXT:    callq __truncdfhf2
; ALL-NEXT:    movw %ax, (%rbx)
; ALL-NEXT:    popq %rbx
; ALL-NEXT:    retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, i16* %a1
  ret void
}

define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_2f64_to_2i16:
; ALL:       # %bb.0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    pushq %rbx
; ALL-NEXT:    subq $24, %rsp
; ALL-NEXT:    movq %rdi, %rbx
; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT:    callq __truncdfhf2
3714 ; ALL-NEXT: movl %eax, %ebp 3715 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3716 ; ALL-NEXT: callq __truncdfhf2 3717 ; ALL-NEXT: movw %ax, (%rbx) 3718 ; ALL-NEXT: movw %bp, 2(%rbx) 3719 ; ALL-NEXT: addq $24, %rsp 3720 ; ALL-NEXT: popq %rbx 3721 ; ALL-NEXT: popq %rbp 3722 ; ALL-NEXT: retq 3723 %1 = fptrunc <2 x double> %a0 to <2 x half> 3724 %2 = bitcast <2 x half> %1 to <2 x i16> 3725 store <2 x i16> %2, <2 x i16>* %a1 3726 ret void 3727 } 3728 3729 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { 3730 ; AVX1-LABEL: store_cvt_4f64_to_4i16: 3731 ; AVX1: # %bb.0: 3732 ; AVX1-NEXT: pushq %rbp 3733 ; AVX1-NEXT: pushq %r15 3734 ; AVX1-NEXT: pushq %r14 3735 ; AVX1-NEXT: pushq %rbx 3736 ; AVX1-NEXT: subq $88, %rsp 3737 ; AVX1-NEXT: movq %rdi, %rbx 3738 ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3739 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3740 ; AVX1-NEXT: vzeroupper 3741 ; AVX1-NEXT: callq __truncdfhf2 3742 ; AVX1-NEXT: movl %eax, %r14d 3743 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3744 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3745 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3746 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3747 ; AVX1-NEXT: vzeroupper 3748 ; AVX1-NEXT: callq __truncdfhf2 3749 ; AVX1-NEXT: movl %eax, %r15d 3750 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3751 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3752 ; AVX1-NEXT: vzeroupper 3753 ; AVX1-NEXT: callq __truncdfhf2 3754 ; AVX1-NEXT: movl %eax, %ebp 3755 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3756 ; AVX1-NEXT: callq __truncdfhf2 3757 ; AVX1-NEXT: movw %ax, 4(%rbx) 3758 ; AVX1-NEXT: movw %bp, (%rbx) 3759 ; AVX1-NEXT: movw %r15w, 6(%rbx) 3760 ; AVX1-NEXT: movw %r14w, 2(%rbx) 3761 ; AVX1-NEXT: addq $88, %rsp 3762 ; AVX1-NEXT: popq %rbx 3763 ; AVX1-NEXT: popq %r14 3764 ; AVX1-NEXT: popq %r15 3765 ; 
AVX1-NEXT: popq %rbp 3766 ; AVX1-NEXT: retq 3767 ; 3768 ; AVX2-LABEL: store_cvt_4f64_to_4i16: 3769 ; AVX2: # %bb.0: 3770 ; AVX2-NEXT: pushq %rbp 3771 ; AVX2-NEXT: pushq %r15 3772 ; AVX2-NEXT: pushq %r14 3773 ; AVX2-NEXT: pushq %rbx 3774 ; AVX2-NEXT: subq $88, %rsp 3775 ; AVX2-NEXT: movq %rdi, %rbx 3776 ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3777 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3778 ; AVX2-NEXT: vzeroupper 3779 ; AVX2-NEXT: callq __truncdfhf2 3780 ; AVX2-NEXT: movl %eax, %r14d 3781 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3782 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3783 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3784 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3785 ; AVX2-NEXT: vzeroupper 3786 ; AVX2-NEXT: callq __truncdfhf2 3787 ; AVX2-NEXT: movl %eax, %r15d 3788 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3789 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3790 ; AVX2-NEXT: vzeroupper 3791 ; AVX2-NEXT: callq __truncdfhf2 3792 ; AVX2-NEXT: movl %eax, %ebp 3793 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3794 ; AVX2-NEXT: callq __truncdfhf2 3795 ; AVX2-NEXT: movw %ax, 4(%rbx) 3796 ; AVX2-NEXT: movw %bp, (%rbx) 3797 ; AVX2-NEXT: movw %r15w, 6(%rbx) 3798 ; AVX2-NEXT: movw %r14w, 2(%rbx) 3799 ; AVX2-NEXT: addq $88, %rsp 3800 ; AVX2-NEXT: popq %rbx 3801 ; AVX2-NEXT: popq %r14 3802 ; AVX2-NEXT: popq %r15 3803 ; AVX2-NEXT: popq %rbp 3804 ; AVX2-NEXT: retq 3805 ; 3806 ; AVX512-LABEL: store_cvt_4f64_to_4i16: 3807 ; AVX512: # %bb.0: 3808 ; AVX512-NEXT: pushq %rbp 3809 ; AVX512-NEXT: pushq %r15 3810 ; AVX512-NEXT: pushq %r14 3811 ; AVX512-NEXT: pushq %rbx 3812 ; AVX512-NEXT: subq $88, %rsp 3813 ; AVX512-NEXT: movq %rdi, %rbx 3814 ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3815 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3816 ; AVX512-NEXT: vzeroupper 3817 ; AVX512-NEXT: 
callq __truncdfhf2 3818 ; AVX512-NEXT: movl %eax, %r14d 3819 ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3820 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3821 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3822 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3823 ; AVX512-NEXT: vzeroupper 3824 ; AVX512-NEXT: callq __truncdfhf2 3825 ; AVX512-NEXT: movl %eax, %r15d 3826 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3827 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3828 ; AVX512-NEXT: vzeroupper 3829 ; AVX512-NEXT: callq __truncdfhf2 3830 ; AVX512-NEXT: movl %eax, %ebp 3831 ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3832 ; AVX512-NEXT: callq __truncdfhf2 3833 ; AVX512-NEXT: movw %ax, 4(%rbx) 3834 ; AVX512-NEXT: movw %bp, (%rbx) 3835 ; AVX512-NEXT: movw %r15w, 6(%rbx) 3836 ; AVX512-NEXT: movw %r14w, 2(%rbx) 3837 ; AVX512-NEXT: addq $88, %rsp 3838 ; AVX512-NEXT: popq %rbx 3839 ; AVX512-NEXT: popq %r14 3840 ; AVX512-NEXT: popq %r15 3841 ; AVX512-NEXT: popq %rbp 3842 ; AVX512-NEXT: retq 3843 %1 = fptrunc <4 x double> %a0 to <4 x half> 3844 %2 = bitcast <4 x half> %1 to <4 x i16> 3845 store <4 x i16> %2, <4 x i16>* %a1 3846 ret void 3847 } 3848 3849 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { 3850 ; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: 3851 ; AVX1: # %bb.0: 3852 ; AVX1-NEXT: pushq %rbp 3853 ; AVX1-NEXT: pushq %r14 3854 ; AVX1-NEXT: pushq %rbx 3855 ; AVX1-NEXT: subq $32, %rsp 3856 ; AVX1-NEXT: movq %rdi, %r14 3857 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3858 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3859 ; AVX1-NEXT: vzeroupper 3860 ; AVX1-NEXT: callq __truncdfhf2 3861 ; AVX1-NEXT: movl %eax, %ebp 3862 ; AVX1-NEXT: shll $16, %ebp 3863 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3864 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3865 ; AVX1-NEXT: vzeroupper 3866 ; 
AVX1-NEXT: callq __truncdfhf2 3867 ; AVX1-NEXT: movzwl %ax, %ebx 3868 ; AVX1-NEXT: orl %ebp, %ebx 3869 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3870 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3871 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3872 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3873 ; AVX1-NEXT: vzeroupper 3874 ; AVX1-NEXT: callq __truncdfhf2 3875 ; AVX1-NEXT: movl %eax, %ebp 3876 ; AVX1-NEXT: shll $16, %ebp 3877 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3878 ; AVX1-NEXT: callq __truncdfhf2 3879 ; AVX1-NEXT: movzwl %ax, %eax 3880 ; AVX1-NEXT: orl %ebp, %eax 3881 ; AVX1-NEXT: shlq $32, %rax 3882 ; AVX1-NEXT: orq %rbx, %rax 3883 ; AVX1-NEXT: vmovq %rax, %xmm0 3884 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3885 ; AVX1-NEXT: vmovdqa %xmm0, (%r14) 3886 ; AVX1-NEXT: addq $32, %rsp 3887 ; AVX1-NEXT: popq %rbx 3888 ; AVX1-NEXT: popq %r14 3889 ; AVX1-NEXT: popq %rbp 3890 ; AVX1-NEXT: retq 3891 ; 3892 ; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: 3893 ; AVX2: # %bb.0: 3894 ; AVX2-NEXT: pushq %rbp 3895 ; AVX2-NEXT: pushq %r14 3896 ; AVX2-NEXT: pushq %rbx 3897 ; AVX2-NEXT: subq $32, %rsp 3898 ; AVX2-NEXT: movq %rdi, %r14 3899 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3900 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3901 ; AVX2-NEXT: vzeroupper 3902 ; AVX2-NEXT: callq __truncdfhf2 3903 ; AVX2-NEXT: movl %eax, %ebp 3904 ; AVX2-NEXT: shll $16, %ebp 3905 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3906 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3907 ; AVX2-NEXT: vzeroupper 3908 ; AVX2-NEXT: callq __truncdfhf2 3909 ; AVX2-NEXT: movzwl %ax, %ebx 3910 ; AVX2-NEXT: orl %ebp, %ebx 3911 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3912 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3913 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3914 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3915 ; AVX2-NEXT: vzeroupper 3916 ; AVX2-NEXT: callq __truncdfhf2 3917 ; AVX2-NEXT: movl %eax, %ebp 
3918 ; AVX2-NEXT: shll $16, %ebp 3919 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3920 ; AVX2-NEXT: callq __truncdfhf2 3921 ; AVX2-NEXT: movzwl %ax, %eax 3922 ; AVX2-NEXT: orl %ebp, %eax 3923 ; AVX2-NEXT: shlq $32, %rax 3924 ; AVX2-NEXT: orq %rbx, %rax 3925 ; AVX2-NEXT: vmovq %rax, %xmm0 3926 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3927 ; AVX2-NEXT: vmovdqa %xmm0, (%r14) 3928 ; AVX2-NEXT: addq $32, %rsp 3929 ; AVX2-NEXT: popq %rbx 3930 ; AVX2-NEXT: popq %r14 3931 ; AVX2-NEXT: popq %rbp 3932 ; AVX2-NEXT: retq 3933 ; 3934 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: 3935 ; AVX512: # %bb.0: 3936 ; AVX512-NEXT: pushq %rbp 3937 ; AVX512-NEXT: pushq %r14 3938 ; AVX512-NEXT: pushq %rbx 3939 ; AVX512-NEXT: subq $32, %rsp 3940 ; AVX512-NEXT: movq %rdi, %r14 3941 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3942 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3943 ; AVX512-NEXT: vzeroupper 3944 ; AVX512-NEXT: callq __truncdfhf2 3945 ; AVX512-NEXT: movl %eax, %ebp 3946 ; AVX512-NEXT: shll $16, %ebp 3947 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3948 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3949 ; AVX512-NEXT: vzeroupper 3950 ; AVX512-NEXT: callq __truncdfhf2 3951 ; AVX512-NEXT: movzwl %ax, %ebx 3952 ; AVX512-NEXT: orl %ebp, %ebx 3953 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3954 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3955 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3956 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3957 ; AVX512-NEXT: vzeroupper 3958 ; AVX512-NEXT: callq __truncdfhf2 3959 ; AVX512-NEXT: movl %eax, %ebp 3960 ; AVX512-NEXT: shll $16, %ebp 3961 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3962 ; AVX512-NEXT: callq __truncdfhf2 3963 ; AVX512-NEXT: movzwl %ax, %eax 3964 ; AVX512-NEXT: orl %ebp, %eax 3965 ; AVX512-NEXT: shlq $32, %rax 3966 ; AVX512-NEXT: orq %rbx, %rax 3967 ; AVX512-NEXT: vmovq %rax, %xmm0 3968 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,3,4,5,6,7] 3969 ; AVX512-NEXT: vmovdqa %xmm0, (%r14) 3970 ; AVX512-NEXT: addq $32, %rsp 3971 ; AVX512-NEXT: popq %rbx 3972 ; AVX512-NEXT: popq %r14 3973 ; AVX512-NEXT: popq %rbp 3974 ; AVX512-NEXT: retq 3975 %1 = fptrunc <4 x double> %a0 to <4 x half> 3976 %2 = bitcast <4 x half> %1 to <4 x i16> 3977 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3978 store <8 x i16> %3, <8 x i16>* %a1 3979 ret void 3980 } 3981 3982 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { 3983 ; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: 3984 ; AVX1: # %bb.0: 3985 ; AVX1-NEXT: pushq %rbp 3986 ; AVX1-NEXT: pushq %r14 3987 ; AVX1-NEXT: pushq %rbx 3988 ; AVX1-NEXT: subq $32, %rsp 3989 ; AVX1-NEXT: movq %rdi, %r14 3990 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3991 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3992 ; AVX1-NEXT: vzeroupper 3993 ; AVX1-NEXT: callq __truncdfhf2 3994 ; AVX1-NEXT: movl %eax, %ebp 3995 ; AVX1-NEXT: shll $16, %ebp 3996 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3997 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3998 ; AVX1-NEXT: vzeroupper 3999 ; AVX1-NEXT: callq __truncdfhf2 4000 ; AVX1-NEXT: movzwl %ax, %ebx 4001 ; AVX1-NEXT: orl %ebp, %ebx 4002 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4003 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4004 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4005 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4006 ; AVX1-NEXT: vzeroupper 4007 ; AVX1-NEXT: callq __truncdfhf2 4008 ; AVX1-NEXT: movl %eax, %ebp 4009 ; AVX1-NEXT: shll $16, %ebp 4010 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4011 ; AVX1-NEXT: callq __truncdfhf2 4012 ; AVX1-NEXT: movzwl %ax, %eax 4013 ; AVX1-NEXT: orl %ebp, %eax 4014 ; AVX1-NEXT: shlq $32, %rax 4015 ; AVX1-NEXT: orq %rbx, %rax 4016 ; AVX1-NEXT: vmovq %rax, %xmm0 4017 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4018 ; 
AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 4019 ; AVX1-NEXT: vmovdqa %xmm0, (%r14) 4020 ; AVX1-NEXT: addq $32, %rsp 4021 ; AVX1-NEXT: popq %rbx 4022 ; AVX1-NEXT: popq %r14 4023 ; AVX1-NEXT: popq %rbp 4024 ; AVX1-NEXT: retq 4025 ; 4026 ; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero: 4027 ; AVX2-SLOW: # %bb.0: 4028 ; AVX2-SLOW-NEXT: pushq %rbp 4029 ; AVX2-SLOW-NEXT: pushq %r14 4030 ; AVX2-SLOW-NEXT: pushq %rbx 4031 ; AVX2-SLOW-NEXT: subq $32, %rsp 4032 ; AVX2-SLOW-NEXT: movq %rdi, %r14 4033 ; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4034 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4035 ; AVX2-SLOW-NEXT: vzeroupper 4036 ; AVX2-SLOW-NEXT: callq __truncdfhf2 4037 ; AVX2-SLOW-NEXT: movl %eax, %ebp 4038 ; AVX2-SLOW-NEXT: shll $16, %ebp 4039 ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4040 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4041 ; AVX2-SLOW-NEXT: vzeroupper 4042 ; AVX2-SLOW-NEXT: callq __truncdfhf2 4043 ; AVX2-SLOW-NEXT: movzwl %ax, %ebx 4044 ; AVX2-SLOW-NEXT: orl %ebp, %ebx 4045 ; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4046 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 4047 ; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4048 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4049 ; AVX2-SLOW-NEXT: vzeroupper 4050 ; AVX2-SLOW-NEXT: callq __truncdfhf2 4051 ; AVX2-SLOW-NEXT: movl %eax, %ebp 4052 ; AVX2-SLOW-NEXT: shll $16, %ebp 4053 ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4054 ; AVX2-SLOW-NEXT: callq __truncdfhf2 4055 ; AVX2-SLOW-NEXT: movzwl %ax, %eax 4056 ; AVX2-SLOW-NEXT: orl %ebp, %eax 4057 ; AVX2-SLOW-NEXT: shlq $32, %rax 4058 ; AVX2-SLOW-NEXT: orq %rbx, %rax 4059 ; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 4060 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4061 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 4062 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r14) 4063 ; AVX2-SLOW-NEXT: addq $32, %rsp 4064 ; AVX2-SLOW-NEXT: popq %rbx 4065 ; 
AVX2-SLOW-NEXT: popq %r14 4066 ; AVX2-SLOW-NEXT: popq %rbp 4067 ; AVX2-SLOW-NEXT: retq 4068 ; 4069 ; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero: 4070 ; AVX2-FAST: # %bb.0: 4071 ; AVX2-FAST-NEXT: pushq %rbp 4072 ; AVX2-FAST-NEXT: pushq %r14 4073 ; AVX2-FAST-NEXT: pushq %rbx 4074 ; AVX2-FAST-NEXT: subq $32, %rsp 4075 ; AVX2-FAST-NEXT: movq %rdi, %r14 4076 ; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4077 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4078 ; AVX2-FAST-NEXT: vzeroupper 4079 ; AVX2-FAST-NEXT: callq __truncdfhf2 4080 ; AVX2-FAST-NEXT: movl %eax, %ebp 4081 ; AVX2-FAST-NEXT: shll $16, %ebp 4082 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4083 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4084 ; AVX2-FAST-NEXT: vzeroupper 4085 ; AVX2-FAST-NEXT: callq __truncdfhf2 4086 ; AVX2-FAST-NEXT: movzwl %ax, %ebx 4087 ; AVX2-FAST-NEXT: orl %ebp, %ebx 4088 ; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4089 ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 4090 ; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4091 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4092 ; AVX2-FAST-NEXT: vzeroupper 4093 ; AVX2-FAST-NEXT: callq __truncdfhf2 4094 ; AVX2-FAST-NEXT: movl %eax, %ebp 4095 ; AVX2-FAST-NEXT: shll $16, %ebp 4096 ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4097 ; AVX2-FAST-NEXT: callq __truncdfhf2 4098 ; AVX2-FAST-NEXT: movzwl %ax, %eax 4099 ; AVX2-FAST-NEXT: orl %ebp, %eax 4100 ; AVX2-FAST-NEXT: shlq $32, %rax 4101 ; AVX2-FAST-NEXT: orq %rbx, %rax 4102 ; AVX2-FAST-NEXT: vmovq %rax, %xmm0 4103 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 4104 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r14) 4105 ; AVX2-FAST-NEXT: addq $32, %rsp 4106 ; AVX2-FAST-NEXT: popq %rbx 4107 ; AVX2-FAST-NEXT: popq %r14 4108 ; AVX2-FAST-NEXT: popq %rbp 4109 ; AVX2-FAST-NEXT: retq 4110 ; 4111 ; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero: 4112 ; AVX512F: # %bb.0: 
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    pushq %r14
; AVX512F-NEXT:    pushq %rbx
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movq %rdi, %r14
; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movl %eax, %ebp
; AVX512F-NEXT:    shll $16, %ebp
; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movzwl %ax, %ebx
; AVX512F-NEXT:    orl %ebp, %ebx
; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movl %eax, %ebp
; AVX512F-NEXT:    shll $16, %ebp
; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movzwl %ax, %eax
; AVX512F-NEXT:    orl %ebp, %eax
; AVX512F-NEXT:    shlq $32, %rax
; AVX512F-NEXT:    orq %rbx, %rax
; AVX512F-NEXT:    vmovq %rax, %xmm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
; AVX512F-NEXT:    addq $32, %rsp
; AVX512F-NEXT:    popq %rbx
; AVX512F-NEXT:    popq %r14
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    pushq %rbp
; AVX512VL-NEXT:    pushq %r14
; AVX512VL-NEXT:    pushq %rbx
; AVX512VL-NEXT:    subq $32, %rsp
; AVX512VL-NEXT:    movq %rdi, %r14
; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movl %eax, %ebp
; AVX512VL-NEXT:    shll $16, %ebp
; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movzwl %ax, %ebx
; AVX512VL-NEXT:    orl %ebp, %ebx
; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movl %eax, %ebp
; AVX512VL-NEXT:    shll $16, %ebp
; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movzwl %ax, %eax
; AVX512VL-NEXT:    orl %ebp, %eax
; AVX512VL-NEXT:    shlq $32, %rax
; AVX512VL-NEXT:    orq %rbx, %rax
; AVX512VL-NEXT:    vmovq %rax, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT:    addq $32, %rsp
; AVX512VL-NEXT:    popq %rbx
; AVX512VL-NEXT:    popq %r14
; AVX512VL-NEXT:    popq %rbp
; AVX512VL-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

; Checks lowering of a truncating <8 x double> -> <8 x i16> (f64 -> f16) store.
; There is no direct f64->f16 conversion instruction here, so each of the eight
; elements is converted through a __truncdfhf2 libcall; the vector argument is
; spilled and reloaded around every call, with vzeroupper emitted before each
; call while the upper YMM/ZMM state is live.
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    pushq %r15
; AVX1-NEXT:    pushq %r14
; AVX1-NEXT:    pushq %r13
; AVX1-NEXT:    pushq %r12
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $136, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r12d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r13d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %ebp
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r14d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r15d
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, 12(%rbx)
; AVX1-NEXT:    movw %r15w, 8(%rbx)
; AVX1-NEXT:    movw %r14w, 4(%rbx)
; AVX1-NEXT:    movw %bp, (%rbx)
; AVX1-NEXT:    movw %r13w, 14(%rbx)
; AVX1-NEXT:    movw %r12w, 10(%rbx)
; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT:    movw %ax, 6(%rbx)
; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT:    movw %ax, 2(%rbx)
; AVX1-NEXT:    addq $136, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    popq %r12
; AVX1-NEXT:    popq %r13
; AVX1-NEXT:    popq %r14
; AVX1-NEXT:    popq %r15
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    pushq %r15
; AVX2-NEXT:    pushq %r14
; AVX2-NEXT:    pushq %r13
; AVX2-NEXT:    pushq %r12
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $136, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r12d
; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r13d
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %ebp
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r14d
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r15d
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, 12(%rbx)
; AVX2-NEXT:    movw %r15w, 8(%rbx)
; AVX2-NEXT:    movw %r14w, 4(%rbx)
; AVX2-NEXT:    movw %bp, (%rbx)
; AVX2-NEXT:    movw %r13w, 14(%rbx)
; AVX2-NEXT:    movw %r12w, 10(%rbx)
; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT:    movw %ax, 6(%rbx)
; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT:    movw %ax, 2(%rbx)
; AVX2-NEXT:    addq $136, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    popq %r12
; AVX2-NEXT:    popq %r13
; AVX2-NEXT:    popq %r14
; AVX2-NEXT:    popq %r15
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    pushq %r13
; AVX512-NEXT:    pushq %r12
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $200, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r12d
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r13d
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebp
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r14d
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r15d
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, 12(%rbx)
; AVX512-NEXT:    movw %r15w, 8(%rbx)
; AVX512-NEXT:    movw %r14w, 4(%rbx)
; AVX512-NEXT:    movw %bp, (%rbx)
; AVX512-NEXT:    movw %r13w, 14(%rbx)
; AVX512-NEXT:    movw %r12w, 10(%rbx)
; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT:    movw %ax, 6(%rbx)
; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT:    movw %ax, 2(%rbx)
; AVX512-NEXT:    addq $200, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    popq %r12
; AVX512-NEXT:    popq %r13
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* %a1
  ret void
}