1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 5 6 ; 7 ; Half to Float 8 ; 9 10 define float @cvt_i16_to_f32(i16 %a0) { 11 ; ALL-LABEL: cvt_i16_to_f32: 12 ; ALL: # BB#0: 13 ; ALL-NEXT: movswl %di, %eax 14 ; ALL-NEXT: vmovd %eax, %xmm0 15 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 16 ; ALL-NEXT: retq 17 %1 = bitcast i16 %a0 to half 18 %2 = fpext half %1 to float 19 ret float %2 20 } 21 22 define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) { 23 ; ALL-LABEL: cvt_4i16_to_4f32: 24 ; ALL: # BB#0: 25 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 26 ; ALL-NEXT: vmovq %xmm0, %rax 27 ; ALL-NEXT: movq %rax, %rcx 28 ; ALL-NEXT: movq %rax, %rdx 29 ; ALL-NEXT: movswl %ax, %esi 30 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 31 ; ALL-NEXT: shrl $16, %eax 32 ; ALL-NEXT: shrq $32, %rcx 33 ; ALL-NEXT: shrq $48, %rdx 34 ; ALL-NEXT: movswl %dx, %edx 35 ; ALL-NEXT: vmovd %edx, %xmm0 36 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 37 ; ALL-NEXT: movswl %cx, %ecx 38 ; ALL-NEXT: vmovd %ecx, %xmm1 39 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 40 ; ALL-NEXT: cwtl 41 ; ALL-NEXT: vmovd %eax, %xmm2 42 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 43 ; ALL-NEXT: vmovd %esi, %xmm3 44 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 45 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 46 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 47 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 48 ; ALL-NEXT: retq 49 %1 = bitcast <4 x i16> %a0 to <4 x half> 50 %2 = fpext <4 x half> %1 to <4 x float> 51 ret <4 x float> %2 52 } 53 54 define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) { 55 ; ALL-LABEL: cvt_8i16_to_4f32: 56 ; ALL: # BB#0: 57 ; ALL-NEXT: vmovq %xmm0, %rax 58 ; ALL-NEXT: movq %rax, %rcx 59 ; ALL-NEXT: movq %rax, %rdx 60 ; ALL-NEXT: movswl %ax, %esi 61 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 62 ; ALL-NEXT: shrl $16, %eax 63 ; ALL-NEXT: shrq $32, %rcx 64 ; ALL-NEXT: shrq $48, %rdx 65 ; ALL-NEXT: movswl %dx, %edx 66 ; ALL-NEXT: vmovd %edx, %xmm0 67 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 68 ; ALL-NEXT: movswl %cx, %ecx 69 ; ALL-NEXT: vmovd %ecx, %xmm1 70 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 71 ; ALL-NEXT: cwtl 72 ; ALL-NEXT: vmovd %eax, %xmm2 73 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 74 ; ALL-NEXT: vmovd %esi, %xmm3 75 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 76 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 77 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 78 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 79 ; ALL-NEXT: retq 80 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 81 %2 = bitcast <4 x i16> %1 to <4 x half> 82 %3 = fpext <4 x half> %2 to <4 x float> 83 ret <4 x float> %3 84 } 85 86 define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { 87 ; AVX1-LABEL: cvt_8i16_to_8f32: 88 ; AVX1: # BB#0: 89 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx 90 ; AVX1-NEXT: movq %rdx, %r8 91 ; AVX1-NEXT: movq %rdx, %r10 92 ; AVX1-NEXT: movswl %dx, %r9d 93 ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> 94 ; AVX1-NEXT: shrl $16, %edx 95 ; AVX1-NEXT: shrq $32, %r8 96 ; AVX1-NEXT: shrq $48, %r10 97 ; AVX1-NEXT: vmovq %xmm0, %rdi 98 ; AVX1-NEXT: movq %rdi, %rax 99 ; AVX1-NEXT: movq %rdi, %rsi 100 ; AVX1-NEXT: movswl %di, %ecx 101 ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> 102 ; AVX1-NEXT: shrl $16, %edi 103 ; AVX1-NEXT: shrq $32, %rax 104 ; AVX1-NEXT: shrq $48, %rsi 105 ; AVX1-NEXT: movswl %si, %esi 106 ; AVX1-NEXT: vmovd %esi, %xmm0 107 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 108 ; AVX1-NEXT: cwtl 109 ; AVX1-NEXT: vmovd %eax, %xmm1 110 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 111 ; AVX1-NEXT: movswl %di, %eax 112 ; AVX1-NEXT: vmovd %eax, %xmm2 113 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 114 ; AVX1-NEXT: vmovd %ecx, %xmm3 115 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 116 ; AVX1-NEXT: movswl %r10w, %eax 117 ; AVX1-NEXT: vmovd %eax, %xmm4 118 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 119 ; AVX1-NEXT: movswl %r8w, %eax 120 ; AVX1-NEXT: vmovd %eax, %xmm5 121 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 122 ; AVX1-NEXT: movswl %dx, %eax 123 ; AVX1-NEXT: vmovd %eax, %xmm6 124 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 125 ; AVX1-NEXT: vmovd %r9d, %xmm7 126 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 127 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] 128 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 129 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 130 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 131 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 132 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 133 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 134 ; AVX1-NEXT: retq 135 ; 136 ; AVX2-LABEL: cvt_8i16_to_8f32: 137 ; AVX2: # BB#0: 138 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx 139 ; AVX2-NEXT: movq %rdx, %r8 140 ; AVX2-NEXT: movq %rdx, %r10 141 ; AVX2-NEXT: movswl %dx, %r9d 142 ; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> 143 ; AVX2-NEXT: shrl $16, %edx 144 ; AVX2-NEXT: shrq $32, %r8 145 ; AVX2-NEXT: shrq $48, %r10 146 ; AVX2-NEXT: vmovq %xmm0, %rdi 147 ; AVX2-NEXT: movq %rdi, %rax 148 ; AVX2-NEXT: movq %rdi, %rsi 149 ; AVX2-NEXT: movswl %di, %ecx 150 ; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> 151 ; AVX2-NEXT: shrl $16, %edi 152 ; AVX2-NEXT: shrq $32, %rax 153 ; AVX2-NEXT: shrq $48, %rsi 154 ; AVX2-NEXT: movswl %si, %esi 155 ; AVX2-NEXT: vmovd %esi, %xmm0 156 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 157 ; AVX2-NEXT: cwtl 158 ; AVX2-NEXT: vmovd %eax, %xmm1 159 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 160 ; AVX2-NEXT: movswl %di, %eax 161 ; AVX2-NEXT: vmovd %eax, %xmm2 162 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 163 ; AVX2-NEXT: vmovd %ecx, %xmm3 164 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 165 ; AVX2-NEXT: movswl %r10w, %eax 166 ; AVX2-NEXT: vmovd %eax, %xmm4 167 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 168 ; AVX2-NEXT: movswl %r8w, %eax 169 ; AVX2-NEXT: vmovd %eax, %xmm5 170 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 171 ; AVX2-NEXT: movswl %dx, %eax 172 ; AVX2-NEXT: vmovd %eax, %xmm6 173 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 174 ; AVX2-NEXT: vmovd %r9d, %xmm7 175 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 176 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] 177 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 178 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 179 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 180 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 181 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 182 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 183 ; AVX2-NEXT: retq 184 ; 185 ; AVX512-LABEL: cvt_8i16_to_8f32: 186 ; AVX512: # BB#0: 187 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx 188 ; AVX512-NEXT: movq %rdx, %r8 189 ; AVX512-NEXT: movq %rdx, %r10 190 ; AVX512-NEXT: movswl %dx, %r9d 191 ; AVX512-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> 192 ; AVX512-NEXT: shrl $16, %edx 193 ; AVX512-NEXT: shrq $32, %r8 194 ; AVX512-NEXT: shrq $48, %r10 195 ; AVX512-NEXT: vmovq %xmm0, %rdi 196 ; AVX512-NEXT: movq %rdi, %rax 197 ; AVX512-NEXT: movq %rdi, %rsi 198 ; AVX512-NEXT: movswl %di, %ecx 199 ; AVX512-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> 200 ; AVX512-NEXT: shrl $16, %edi 201 ; AVX512-NEXT: shrq $32, %rax 202 ; AVX512-NEXT: shrq $48, %rsi 203 ; AVX512-NEXT: movswl %si, %esi 204 ; AVX512-NEXT: vmovd %esi, %xmm0 205 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 206 ; AVX512-NEXT: cwtl 207 ; AVX512-NEXT: vmovd %eax, %xmm1 208 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 209 ; AVX512-NEXT: movswl %di, %eax 210 ; AVX512-NEXT: vmovd %eax, %xmm2 211 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 212 ; AVX512-NEXT: vmovd %ecx, %xmm3 213 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 214 ; AVX512-NEXT: movswl %r10w, %eax 215 ; AVX512-NEXT: vmovd %eax, %xmm4 216 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 217 ; AVX512-NEXT: movswl %r8w, %eax 218 ; AVX512-NEXT: vmovd %eax, %xmm5 219 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 220 ; AVX512-NEXT: movswl %dx, %eax 221 ; AVX512-NEXT: vmovd %eax, %xmm6 222 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 223 ; AVX512-NEXT: vmovd %r9d, %xmm7 224 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 225 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] 226 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 227 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 228 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 229 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 230 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 231 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 232 ; AVX512-NEXT: retq 233 %1 = bitcast <8 x i16> %a0 to <8 x half> 234 %2 = fpext <8 x half> %1 to <8 x float> 235 ret <8 x float> %2 236 } 237 238 define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { 239 ; AVX1-LABEL: cvt_16i16_to_16f32: 240 ; AVX1: # BB#0: 241 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 242 ; AVX1-NEXT: vmovq %xmm4, %rax 243 ; AVX1-NEXT: movq %rax, %rcx 244 ; AVX1-NEXT: shrq $48, %rcx 245 ; AVX1-NEXT: movswl %cx, %ecx 246 ; AVX1-NEXT: vmovd %ecx, %xmm8 247 ; AVX1-NEXT: movq %rax, %rcx 248 ; AVX1-NEXT: shrq $32, %rcx 249 ; AVX1-NEXT: movswl %cx, %ecx 250 ; AVX1-NEXT: vmovd %ecx, %xmm9 251 ; AVX1-NEXT: movswl %ax, %ecx 252 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 253 ; AVX1-NEXT: shrl $16, %eax 254 ; AVX1-NEXT: cwtl 255 ; AVX1-NEXT: vmovd %eax, %xmm10 256 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax 257 ; AVX1-NEXT: vmovd %ecx, %xmm11 258 ; AVX1-NEXT: movq %rax, %rcx 259 ; AVX1-NEXT: shrq $48, %rcx 260 ; AVX1-NEXT: movswl %cx, %ecx 261 ; AVX1-NEXT: vmovd %ecx, %xmm12 262 ; AVX1-NEXT: movq %rax, %rcx 263 ; AVX1-NEXT: shrq $32, %rcx 264 ; AVX1-NEXT: movswl %cx, %ecx 265 ; AVX1-NEXT: vmovd %ecx, %xmm13 266 ; AVX1-NEXT: movswl %ax, %ecx 267 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 268 ; AVX1-NEXT: shrl $16, %eax 269 ; AVX1-NEXT: cwtl 270 ; AVX1-NEXT: vmovd %eax, %xmm14 271 ; AVX1-NEXT: vmovq %xmm0, %rax 272 ; AVX1-NEXT: vmovd %ecx, %xmm15 273 ; AVX1-NEXT: movq %rax, %rcx 274 ; AVX1-NEXT: shrq $48, %rcx 275 ; AVX1-NEXT: movswl %cx, %ecx 276 ; AVX1-NEXT: vmovd %ecx, %xmm2 277 ; AVX1-NEXT: movq %rax, %rcx 278 ; AVX1-NEXT: shrq $32, %rcx 279 ; AVX1-NEXT: movswl %cx, %ecx 280 ; AVX1-NEXT: vmovd %ecx, %xmm3 281 ; AVX1-NEXT: movswl %ax, %ecx 282 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 283 ; AVX1-NEXT: shrl $16, %eax 284 ; AVX1-NEXT: cwtl 285 ; AVX1-NEXT: vmovd %eax, %xmm4 286 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 287 ; AVX1-NEXT: vmovd %ecx, %xmm0 288 ; AVX1-NEXT: movq %rax, %rcx 289 ; AVX1-NEXT: shrq $48, %rcx 290 ; AVX1-NEXT: movswl %cx, %ecx 291 ; AVX1-NEXT: vmovd %ecx, %xmm5 292 ; AVX1-NEXT: movq %rax, %rcx 293 ; AVX1-NEXT: shrq $32, %rcx 294 ; AVX1-NEXT: movswl %cx, %ecx 295 ; AVX1-NEXT: vmovd %ecx, %xmm6 296 ; AVX1-NEXT: movl %eax, %ecx 297 ; AVX1-NEXT: shrl $16, %ecx 298 ; AVX1-NEXT: movswl %cx, %ecx 299 ; AVX1-NEXT: vmovd %ecx, %xmm7 300 ; AVX1-NEXT: cwtl 301 ; AVX1-NEXT: vmovd %eax, %xmm1 302 ; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8 303 ; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9 304 ; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10 305 ; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11 306 ; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12 307 ; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13 308 ; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14 309 ; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15 310 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 311 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 312 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 313 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 314 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 315 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 316 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 317 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 318 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] 319 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 320 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 321 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] 322 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] 323 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] 324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 325 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] 326 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 327 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 328 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] 329 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 330 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 331 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 332 ; AVX1-NEXT: retq 333 ; 334 ; AVX2-LABEL: cvt_16i16_to_16f32: 335 ; AVX2: # BB#0: 336 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 337 ; AVX2-NEXT: vmovq %xmm4, %rax 338 ; AVX2-NEXT: movq %rax, %rcx 339 ; AVX2-NEXT: shrq $48, %rcx 340 ; AVX2-NEXT: movswl %cx, %ecx 341 ; AVX2-NEXT: vmovd %ecx, %xmm8 342 ; AVX2-NEXT: movq %rax, %rcx 343 ; AVX2-NEXT: shrq $32, %rcx 344 ; AVX2-NEXT: movswl %cx, %ecx 345 ; AVX2-NEXT: vmovd %ecx, %xmm9 346 ; AVX2-NEXT: movswl %ax, %ecx 347 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 348 ; AVX2-NEXT: shrl $16, %eax 349 ; AVX2-NEXT: cwtl 350 ; AVX2-NEXT: vmovd %eax, %xmm10 351 ; AVX2-NEXT: vpextrq $1, %xmm4, %rax 352 ; AVX2-NEXT: vmovd %ecx, %xmm11 353 ; AVX2-NEXT: movq %rax, %rcx 354 ; AVX2-NEXT: shrq $48, %rcx 355 ; AVX2-NEXT: movswl %cx, %ecx 356 ; AVX2-NEXT: vmovd %ecx, %xmm12 357 ; AVX2-NEXT: movq %rax, %rcx 358 ; AVX2-NEXT: shrq $32, %rcx 359 ; AVX2-NEXT: movswl %cx, %ecx 360 ; AVX2-NEXT: vmovd %ecx, %xmm13 361 ; AVX2-NEXT: movswl %ax, %ecx 362 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 363 ; AVX2-NEXT: shrl $16, %eax 364 ; AVX2-NEXT: cwtl 365 ; AVX2-NEXT: vmovd %eax, %xmm14 366 ; AVX2-NEXT: vmovq %xmm0, %rax 367 ; AVX2-NEXT: vmovd %ecx, %xmm15 368 ; AVX2-NEXT: movq %rax, %rcx 369 ; AVX2-NEXT: shrq $48, %rcx 370 ; AVX2-NEXT: movswl %cx, %ecx 371 ; AVX2-NEXT: vmovd %ecx, %xmm2 372 ; AVX2-NEXT: movq %rax, %rcx 373 ; AVX2-NEXT: shrq $32, %rcx 374 ; AVX2-NEXT: movswl %cx, %ecx 375 ; AVX2-NEXT: vmovd %ecx, %xmm3 376 ; AVX2-NEXT: movswl %ax, %ecx 377 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 378 ; AVX2-NEXT: shrl $16, %eax 379 ; AVX2-NEXT: cwtl 380 ; AVX2-NEXT: vmovd %eax, %xmm4 381 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 382 ; AVX2-NEXT: vmovd %ecx, %xmm0 383 ; AVX2-NEXT: movq %rax, %rcx 384 ; AVX2-NEXT: shrq $48, %rcx 385 ; AVX2-NEXT: movswl %cx, %ecx 386 ; AVX2-NEXT: vmovd %ecx, %xmm5 387 ; AVX2-NEXT: movq %rax, %rcx 388 ; AVX2-NEXT: shrq $32, %rcx 389 ; AVX2-NEXT: movswl %cx, %ecx 390 ; AVX2-NEXT: vmovd %ecx, %xmm6 391 ; AVX2-NEXT: movl %eax, %ecx 392 ; AVX2-NEXT: shrl $16, %ecx 393 ; AVX2-NEXT: movswl %cx, %ecx 394 ; AVX2-NEXT: vmovd %ecx, %xmm7 395 ; AVX2-NEXT: cwtl 396 ; AVX2-NEXT: vmovd %eax, %xmm1 397 ; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8 398 ; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9 399 ; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10 400 ; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11 401 ; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12 402 ; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13 403 ; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14 404 ; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15 405 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 406 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 407 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 408 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 409 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 410 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 411 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 412 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 413 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] 414 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 415 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 416 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] 417 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] 418 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] 419 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 420 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] 421 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 422 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 423 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] 424 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 425 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 426 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 427 ; AVX2-NEXT: retq 428 ; 429 ; AVX512-LABEL: cvt_16i16_to_16f32: 430 ; AVX512: # BB#0: 431 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10 432 ; AVX512-NEXT: vmovq %xmm0, %rax 433 ; AVX512-NEXT: movq %rax, %rcx 434 ; AVX512-NEXT: shrq $48, %rcx 435 ; AVX512-NEXT: movswl %cx, %ecx 436 ; AVX512-NEXT: vmovd %ecx, %xmm8 437 ; AVX512-NEXT: movq %rax, %rcx 438 ; AVX512-NEXT: shrq $32, %rcx 439 ; AVX512-NEXT: movswl %cx, %ecx 440 ; AVX512-NEXT: vmovd %ecx, %xmm9 441 ; AVX512-NEXT: movswl %ax, %ecx 442 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 443 ; AVX512-NEXT: shrl $16, %eax 444 ; AVX512-NEXT: cwtl 445 ; AVX512-NEXT: vmovd %eax, %xmm11 446 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax 447 ; AVX512-NEXT: vmovd %ecx, %xmm12 448 ; AVX512-NEXT: movq %rax, %rcx 449 ; AVX512-NEXT: shrq $48, %rcx 450 ; AVX512-NEXT: movswl %cx, %ecx 451 ; AVX512-NEXT: vmovd %ecx, %xmm13 452 ; AVX512-NEXT: movq %rax, %rcx 453 ; AVX512-NEXT: shrq $32, %rcx 454 ; AVX512-NEXT: movswl %cx, %ecx 455 ; AVX512-NEXT: vmovd %ecx, %xmm14 456 ; AVX512-NEXT: movswl %ax, %ecx 457 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 458 ; AVX512-NEXT: shrl $16, %eax 459 ; AVX512-NEXT: cwtl 460 ; AVX512-NEXT: vmovd %eax, %xmm15 461 ; AVX512-NEXT: vmovq %xmm10, %rax 462 ; AVX512-NEXT: vmovd %ecx, %xmm2 463 ; AVX512-NEXT: movq %rax, %rcx 464 ; AVX512-NEXT: shrq $48, %rcx 465 ; AVX512-NEXT: movswl %cx, %ecx 466 ; AVX512-NEXT: vmovd %ecx, %xmm3 467 ; AVX512-NEXT: movq %rax, %rcx 468 ; AVX512-NEXT: shrq $32, %rcx 469 ; AVX512-NEXT: movswl %cx, %ecx 470 ; AVX512-NEXT: vmovd %ecx, %xmm1 471 ; AVX512-NEXT: movswl %ax, %ecx 472 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 473 ; AVX512-NEXT: shrl $16, %eax 474 ; AVX512-NEXT: cwtl 475 ; AVX512-NEXT: vmovd %eax, %xmm4 476 ; AVX512-NEXT: vpextrq $1, %xmm10, %rax 477 ; AVX512-NEXT: vmovd %ecx, %xmm10 478 ; AVX512-NEXT: movq %rax, %rcx 479 ; AVX512-NEXT: shrq $48, %rcx 480 ; AVX512-NEXT: movswl %cx, %ecx 481 ; AVX512-NEXT: vmovd %ecx, %xmm5 482 ; AVX512-NEXT: movq %rax, %rcx 483 ; AVX512-NEXT: shrq $32, %rcx 484 ; AVX512-NEXT: movswl %cx, %ecx 485 ; AVX512-NEXT: vmovd %ecx, %xmm6 486 ; AVX512-NEXT: movl %eax, %ecx 487 ; AVX512-NEXT: shrl $16, %ecx 488 ; AVX512-NEXT: movswl %cx, %ecx 489 ; AVX512-NEXT: vmovd %ecx, %xmm7 490 ; AVX512-NEXT: cwtl 491 ; AVX512-NEXT: vmovd %eax, %xmm0 492 ; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8 493 ; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9 494 ; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11 495 ; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12 496 ; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13 497 ; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14 498 ; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15 499 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 500 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 501 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 502 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 503 ; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10 504 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 505 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 506 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 507 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 508 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3] 509 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] 510 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] 511 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3] 512 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3] 513 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] 514 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 515 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3] 516 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] 517 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] 518 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] 519 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 520 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 521 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 522 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 523 ; AVX512-NEXT: retq 524 %1 = bitcast <16 x i16> %a0 to <16 x half> 525 %2 = fpext <16 x half> %1 to <16 x float> 526 ret <16 x float> %2 527 } 528 529 ; 530 ; Half to Float (Load) 531 ; 532 533 define float @load_cvt_i16_to_f32(i16* %a0) { 534 ; ALL-LABEL: load_cvt_i16_to_f32: 535 ; ALL: # BB#0: 536 ; ALL-NEXT: movswl (%rdi), %eax 537 ; ALL-NEXT: vmovd %eax, %xmm0 538 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 539 ; ALL-NEXT: retq 540 %1 = load i16, i16* %a0 541 %2 = bitcast i16 %1 to half 542 %3 = fpext half %2 to float 543 ret float %3 544 } 545 546 define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) { 547 ; ALL-LABEL: load_cvt_4i16_to_4f32: 548 ; ALL: # BB#0: 549 ; ALL-NEXT: movswl 6(%rdi), %eax 550 ; ALL-NEXT: vmovd %eax, %xmm0 551 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 552 ; ALL-NEXT: movswl 4(%rdi), %eax 553 ; ALL-NEXT: vmovd %eax, %xmm1 554 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 555 ; ALL-NEXT: movswl (%rdi), %eax 556 ; ALL-NEXT: vmovd %eax, %xmm2 557 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 558 ; ALL-NEXT: movswl 2(%rdi), %eax 559 ; ALL-NEXT: vmovd %eax, %xmm3 560 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 561 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 562 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 563 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 564 ; ALL-NEXT: retq 565 %1 = load <4 x i16>, <4 x i16>* %a0 566 %2 = bitcast <4 x i16> %1 to <4 x half> 567 %3 = fpext <4 x half> %2 to <4 x float> 568 ret <4 x float> %3 569 } 570 571 define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) { 572 ; ALL-LABEL: load_cvt_8i16_to_4f32: 573 ; ALL: # BB#0: 574 ; ALL-NEXT: movq (%rdi), %rax 575 ; ALL-NEXT: movq %rax, %rcx 576 ; ALL-NEXT: movq %rax, %rdx 577 ; ALL-NEXT: movswl %ax, %esi 578 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 579 ; ALL-NEXT: shrl $16, %eax 580 ; ALL-NEXT: shrq $32, %rcx 581 ; ALL-NEXT: shrq $48, %rdx 582 ; ALL-NEXT: movswl %dx, %edx 583 ; ALL-NEXT: vmovd %edx, %xmm0 584 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 585 ; ALL-NEXT: movswl %cx, %ecx 586 ; ALL-NEXT: vmovd %ecx, %xmm1 587 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 588 ; ALL-NEXT: cwtl 589 ; ALL-NEXT: vmovd %eax, %xmm2 590 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 591 ; ALL-NEXT: vmovd %esi, %xmm3 592 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 593 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 594 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 595 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 596 ; ALL-NEXT: retq 597 %1 = load <8 x i16>, <8 x i16>* %a0 598 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 599 %3 = bitcast <4 x i16> %2 to <4 x half> 600 %4 = fpext <4 x half> %3 to <4 x float> 601 ret <4 x float> %4 602 } 603 604 define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) { 605 ; AVX1-LABEL: load_cvt_8i16_to_8f32: 606 ; AVX1: # BB#0: 607 ; AVX1-NEXT: movswl 6(%rdi), %eax 608 ; AVX1-NEXT: vmovd %eax, %xmm0 609 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 610 ; AVX1-NEXT: movswl 4(%rdi), %eax 611 ; AVX1-NEXT: vmovd %eax, %xmm1 612 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 613 ; AVX1-NEXT: movswl (%rdi), %eax 614 ; AVX1-NEXT: vmovd %eax, %xmm2 615 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 616 ; AVX1-NEXT: movswl 2(%rdi), %eax 617 ; AVX1-NEXT: vmovd %eax, %xmm3 618 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 619 ; AVX1-NEXT: movswl 14(%rdi), %eax 620 ; AVX1-NEXT: vmovd %eax, %xmm4 621 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 622 ; AVX1-NEXT: movswl 12(%rdi), %eax 623 ; AVX1-NEXT: vmovd %eax, %xmm5 624 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 625 ; AVX1-NEXT: movswl 8(%rdi), %eax 626 ; AVX1-NEXT: vmovd %eax, %xmm6 627 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 628 ; AVX1-NEXT: movswl 10(%rdi), %eax 629 ; AVX1-NEXT: vmovd %eax, %xmm7 630 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 631 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 632 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 633 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 634 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 635 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 636 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 637 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 638 ; AVX1-NEXT: retq 639 ; 640 ; AVX2-LABEL: load_cvt_8i16_to_8f32: 641 ; AVX2: # BB#0: 642 ; AVX2-NEXT: movswl 6(%rdi), %eax 643 ; AVX2-NEXT: vmovd %eax, %xmm0 644 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 645 ; AVX2-NEXT: movswl 4(%rdi), %eax 646 ; AVX2-NEXT: vmovd %eax, %xmm1 647 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 648 ; AVX2-NEXT: movswl (%rdi), %eax 649 ; AVX2-NEXT: vmovd %eax, %xmm2 650 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 651 ; AVX2-NEXT: movswl 2(%rdi), %eax 652 ; AVX2-NEXT: vmovd %eax, %xmm3 653 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 654 ; AVX2-NEXT: movswl 14(%rdi), %eax 655 ; AVX2-NEXT: vmovd %eax, %xmm4 656 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 657 ; AVX2-NEXT: movswl 12(%rdi), %eax 658 ; AVX2-NEXT: vmovd %eax, %xmm5 659 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 660 ; AVX2-NEXT: movswl 8(%rdi), %eax 661 ; AVX2-NEXT: vmovd %eax, %xmm6 662 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 663 ; AVX2-NEXT: movswl 10(%rdi), %eax 664 ; AVX2-NEXT: vmovd %eax, %xmm7 665 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 666 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 667 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 668 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 669 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 670 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 671 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 672 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 673 ; AVX2-NEXT: retq 674 ; 675 ; AVX512-LABEL: load_cvt_8i16_to_8f32: 676 ; AVX512: # BB#0: 677 ; AVX512-NEXT: movswl 6(%rdi), %eax 678 ; AVX512-NEXT: vmovd %eax, %xmm0 679 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 680 ; AVX512-NEXT: movswl 4(%rdi), %eax 681 ; AVX512-NEXT: vmovd %eax, %xmm1 682 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 683 ; AVX512-NEXT: movswl (%rdi), %eax 684 ; AVX512-NEXT: vmovd %eax, %xmm2 685 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 686 ; AVX512-NEXT: movswl 2(%rdi), %eax 687 ; AVX512-NEXT: vmovd %eax, %xmm3 688 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 689 ; AVX512-NEXT: movswl 14(%rdi), %eax 690 ; AVX512-NEXT: vmovd %eax, %xmm4 691 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 692 ; AVX512-NEXT: movswl 12(%rdi), %eax 693 ; AVX512-NEXT: vmovd %eax, %xmm5 694 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 695 ; AVX512-NEXT: movswl 8(%rdi), %eax 696 ; AVX512-NEXT: vmovd %eax, %xmm6 697 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 698 ; AVX512-NEXT: movswl 10(%rdi), %eax 699 ; AVX512-NEXT: vmovd %eax, %xmm7 700 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 701 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 702 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 703 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 704 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 705 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 706 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 707 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 708 ; AVX512-NEXT: retq 709 %1 = load <8 x i16>, <8 x i16>* %a0 710 %2 = bitcast <8 x i16> %1 to <8 x half> 711 %3 = fpext <8 x half> %2 to <8 x float> 712 ret <8 x float> %3 713 } 714 715 define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) { 716 ; AVX1-LABEL: load_cvt_16i16_to_16f32: 717 ; AVX1: # BB#0: 718 ; AVX1-NEXT: movswl 22(%rdi), %eax 719 ; AVX1-NEXT: vmovd %eax, %xmm0 720 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8 721 ; AVX1-NEXT: movswl 20(%rdi), %eax 722 ; AVX1-NEXT: vmovd %eax, %xmm0 723 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9 724 ; AVX1-NEXT: movswl 16(%rdi), %eax 725 ; AVX1-NEXT: vmovd %eax, %xmm0 726 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10 727 ; AVX1-NEXT: movswl 18(%rdi), %eax 728 ; AVX1-NEXT: vmovd %eax, %xmm0 729 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11 730 ; AVX1-NEXT: movswl 30(%rdi), %eax 731 ; AVX1-NEXT: vmovd %eax, %xmm0 732 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12 733 ; AVX1-NEXT: movswl 28(%rdi), %eax 734 ; AVX1-NEXT: vmovd %eax, %xmm0 735 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13 736 ; AVX1-NEXT: movswl 24(%rdi), %eax 737 ; AVX1-NEXT: vmovd %eax, %xmm0 738 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14 739 ; AVX1-NEXT: movswl 26(%rdi), %eax 740 ; AVX1-NEXT: vmovd %eax, %xmm0 741 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15 742 ; AVX1-NEXT: movswl 6(%rdi), %eax 743 ; AVX1-NEXT: vmovd %eax, %xmm0 744 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 745 ; AVX1-NEXT: movswl 4(%rdi), %eax 746 ; AVX1-NEXT: vmovd %eax, %xmm2 747 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 748 ; AVX1-NEXT: movswl (%rdi), %eax 749 ; AVX1-NEXT: vmovd %eax, %xmm3 750 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 751 ; AVX1-NEXT: movswl 2(%rdi), %eax 752 ; AVX1-NEXT: vmovd %eax, %xmm4 753 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 754 ; AVX1-NEXT: movswl 14(%rdi), %eax 755 ; AVX1-NEXT: vmovd %eax, %xmm5 756 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 757 ; AVX1-NEXT: movswl 12(%rdi), %eax 758 ; AVX1-NEXT: vmovd %eax, %xmm6 759 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 760 ; AVX1-NEXT: movswl 8(%rdi), %eax 761 ; AVX1-NEXT: vmovd %eax, %xmm7 762 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 763 ; AVX1-NEXT: movswl 10(%rdi), %eax 764 ; AVX1-NEXT: vmovd %eax, %xmm1 765 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 766 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 767 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 768 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 769 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 770 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 771 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 772 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 773 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 774 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 775 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 776 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 777 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 778 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 779 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 780 ; AVX1-NEXT: retq 781 ; 782 ; AVX2-LABEL: load_cvt_16i16_to_16f32: 783 ; AVX2: # BB#0: 784 ; AVX2-NEXT: movswl 22(%rdi), %eax 785 ; AVX2-NEXT: vmovd %eax, %xmm0 786 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8 787 ; AVX2-NEXT: movswl 20(%rdi), %eax 788 ; AVX2-NEXT: vmovd %eax, %xmm0 789 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9 790 ; AVX2-NEXT: movswl 16(%rdi), %eax 791 ; AVX2-NEXT: vmovd %eax, %xmm0 792 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10 793 ; AVX2-NEXT: movswl 18(%rdi), %eax 794 ; AVX2-NEXT: vmovd %eax, %xmm0 795 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11 796 ; AVX2-NEXT: movswl 30(%rdi), %eax 797 ; AVX2-NEXT: vmovd %eax, %xmm0 798 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12 799 ; AVX2-NEXT: movswl 28(%rdi), %eax 800 ; AVX2-NEXT: vmovd %eax, %xmm0 801 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13 802 ; AVX2-NEXT: movswl 24(%rdi), %eax 803 ; AVX2-NEXT: vmovd %eax, %xmm0 804 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14 805 ; AVX2-NEXT: movswl 26(%rdi), %eax 806 ; AVX2-NEXT: vmovd %eax, %xmm0 807 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15 808 ; AVX2-NEXT: movswl 6(%rdi), %eax 809 ; AVX2-NEXT: vmovd %eax, %xmm0 810 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 811 ; AVX2-NEXT: movswl 4(%rdi), %eax 812 ; AVX2-NEXT: vmovd %eax, %xmm2 813 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 814 ; AVX2-NEXT: movswl (%rdi), %eax 815 ; AVX2-NEXT: vmovd %eax, %xmm3 816 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 817 ; AVX2-NEXT: movswl 2(%rdi), %eax 818 ; AVX2-NEXT: vmovd %eax, %xmm4 819 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 820 ; AVX2-NEXT: movswl 14(%rdi), %eax 821 ; AVX2-NEXT: vmovd %eax, %xmm5 822 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 823 ; AVX2-NEXT: movswl 12(%rdi), %eax 824 ; AVX2-NEXT: vmovd %eax, %xmm6 825 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 826 ; AVX2-NEXT: movswl 8(%rdi), %eax 827 ; AVX2-NEXT: vmovd %eax, %xmm7 828 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 829 ; AVX2-NEXT: movswl 10(%rdi), %eax 830 ; AVX2-NEXT: vmovd %eax, %xmm1 831 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 832 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 833 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 834 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 835 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 836 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 837 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 838 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 839 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 840 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 841 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 842 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 843 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 844 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 845 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 846 ; AVX2-NEXT: retq 847 ; 848 ; AVX512-LABEL: load_cvt_16i16_to_16f32: 849 ; AVX512: # BB#0: 850 ; AVX512-NEXT: movswl 6(%rdi), %eax 851 ; AVX512-NEXT: vmovd %eax, %xmm0 852 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8 853 ; AVX512-NEXT: movswl 4(%rdi), %eax 854 ; AVX512-NEXT: vmovd %eax, %xmm0 855 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9 856 ; AVX512-NEXT: movswl (%rdi), %eax 857 ; AVX512-NEXT: vmovd %eax, %xmm0 858 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10 859 ; AVX512-NEXT: movswl 2(%rdi), %eax 860 ; AVX512-NEXT: vmovd %eax, %xmm0 861 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11 862 ; AVX512-NEXT: movswl 14(%rdi), %eax 863 ; AVX512-NEXT: vmovd %eax, %xmm0 864 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12 865 ; AVX512-NEXT: movswl 12(%rdi), %eax 866 ; AVX512-NEXT: vmovd %eax, %xmm0 867 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13 868 ; AVX512-NEXT: movswl 8(%rdi), %eax 869 ; AVX512-NEXT: vmovd %eax, %xmm0 870 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14 871 ; AVX512-NEXT: movswl 10(%rdi), %eax 872 ; AVX512-NEXT: vmovd %eax, %xmm0 873 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15 874 ; AVX512-NEXT: movswl 22(%rdi), %eax 875 ; AVX512-NEXT: vmovd %eax, %xmm0 876 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 877 ; AVX512-NEXT: movswl 20(%rdi), %eax 878 ; AVX512-NEXT: vmovd %eax, %xmm1 879 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 880 ; AVX512-NEXT: movswl 16(%rdi), %eax 881 ; AVX512-NEXT: vmovd %eax, %xmm2 882 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 883 ; AVX512-NEXT: movswl 18(%rdi), %eax 884 ; AVX512-NEXT: vmovd %eax, %xmm3 885 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 886 ; AVX512-NEXT: movswl 30(%rdi), %eax 887 ; AVX512-NEXT: vmovd %eax, %xmm4 888 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 889 ; AVX512-NEXT: movswl 28(%rdi), %eax 890 ; AVX512-NEXT: vmovd %eax, %xmm5 891 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 892 ; AVX512-NEXT: movswl 24(%rdi), %eax 893 ; AVX512-NEXT: vmovd %eax, %xmm6 894 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 895 ; AVX512-NEXT: movswl 26(%rdi), %eax 896 ; AVX512-NEXT: vmovd %eax, %xmm7 897 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 898 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 899 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 900 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 901 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 902 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 903 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 904 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 905 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 906 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 907 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 908 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 909 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 910 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 911 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 912 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 913 ; AVX512-NEXT: retq 914 %1 = load <16 x i16>, <16 x i16>* %a0 915 %2 = bitcast <16 x i16> %1 to <16 x half> 916 %3 = fpext <16 x half> %2 to <16 x float> 917 ret <16 x float> %3 918 } 919 920 ; 921 ; Half to Double 922 ; 923 924 define double @cvt_i16_to_f64(i16 %a0) { 925 ; ALL-LABEL: cvt_i16_to_f64: 926 ; ALL: # BB#0: 927 ; ALL-NEXT: movswl %di, %eax 928 ; ALL-NEXT: vmovd %eax, %xmm0 929 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 930 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 931 ; ALL-NEXT: retq 932 %1 = bitcast i16 %a0 to half 933 %2 = fpext half %1 to double 934 ret double %2 935 } 936 937 define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) { 938 ; ALL-LABEL: cvt_2i16_to_2f64: 939 ; ALL: # BB#0: 940 ; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 941 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 942 ; ALL-NEXT: vmovd %xmm0, %eax 943 ; ALL-NEXT: movswl %ax, %ecx 944 ; ALL-NEXT: shrl $16, %eax 945 ; ALL-NEXT: cwtl 946 ; ALL-NEXT: vmovd %eax, %xmm0 947 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 948 ; ALL-NEXT: vmovd %ecx, %xmm1 949 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 950 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 951 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 952 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 953 ; ALL-NEXT: retq 954 %1 = bitcast <2 x i16> %a0 to <2 x half> 955 %2 = fpext <2 x half> %1 to <2 x double> 956 ret <2 x double> %2 957 } 958 959 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) { 960 ; ALL-LABEL: cvt_4i16_to_4f64: 961 ; ALL: # BB#0: 962 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 963 ; ALL-NEXT: vmovq %xmm0, %rax 964 ; ALL-NEXT: movq %rax, %rcx 965 ; ALL-NEXT: movl %eax, %edx 966 ; ALL-NEXT: movswl %ax, %esi 967 ; ALL-NEXT: shrq $48, %rax 968 ; ALL-NEXT: shrq $32, %rcx 969 ; ALL-NEXT: shrl $16, %edx 970 ; ALL-NEXT: movswl %dx, %edx 971 ; ALL-NEXT: vmovd %edx, %xmm0 972 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 973 ; ALL-NEXT: vmovd %esi, %xmm1 974 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 975 ; ALL-NEXT: movswl %cx, %ecx 976 ; ALL-NEXT: vmovd %ecx, %xmm2 977 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 978 ; ALL-NEXT: cwtl 979 ; ALL-NEXT: vmovd %eax, %xmm3 980 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 981 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 982 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 983 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 984 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 985 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 986 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 987 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 988 ; ALL-NEXT: retq 989 %1 = bitcast <4 x i16> %a0 to <4 x half> 990 %2 = fpext <4 x half> %1 to <4 x double> 991 ret <4 x double> %2 992 } 993 994 define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) { 995 ; ALL-LABEL: cvt_8i16_to_2f64: 996 ; ALL: # BB#0: 997 ; ALL-NEXT: vmovd %xmm0, %eax 998 ; ALL-NEXT: movswl %ax, %ecx 999 ; ALL-NEXT: shrl $16, %eax 1000 ; ALL-NEXT: cwtl 1001 ; ALL-NEXT: vmovd %eax, %xmm0 1002 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1003 ; ALL-NEXT: vmovd %ecx, %xmm1 1004 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1005 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1006 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1007 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1008 ; ALL-NEXT: retq 1009 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1010 %2 = bitcast <2 x i16> %1 to <2 x half> 1011 %3 = fpext <2 x half> %2 to <2 x double> 1012 ret <2 x double> %3 1013 } 1014 1015 define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) { 1016 ; ALL-LABEL: cvt_8i16_to_4f64: 1017 ; ALL: # BB#0: 1018 ; ALL-NEXT: vmovq %xmm0, %rax 1019 ; ALL-NEXT: movq %rax, %rcx 1020 ; ALL-NEXT: movl %eax, %edx 1021 ; ALL-NEXT: movswl %ax, %esi 1022 ; ALL-NEXT: shrq $48, %rax 1023 ; ALL-NEXT: shrq $32, %rcx 1024 ; ALL-NEXT: shrl $16, %edx 1025 ; ALL-NEXT: movswl %dx, %edx 1026 ; ALL-NEXT: vmovd %edx, %xmm0 1027 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1028 ; ALL-NEXT: vmovd %esi, %xmm1 1029 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1030 ; ALL-NEXT: movswl %cx, %ecx 1031 ; ALL-NEXT: vmovd %ecx, %xmm2 1032 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1033 ; ALL-NEXT: cwtl 1034 ; ALL-NEXT: vmovd %eax, %xmm3 1035 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1036 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1037 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1038 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1039 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1040 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1041 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1042 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1043 ; ALL-NEXT: retq 1044 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1045 %2 = bitcast <4 x i16> %1 to <4 x half> 1046 %3 = fpext <4 x half> %2 to <4 x double> 1047 ret <4 x double> %3 1048 } 1049 1050 define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) { 1051 ; AVX1-LABEL: cvt_8i16_to_8f64: 1052 ; AVX1: # BB#0: 1053 ; AVX1-NEXT: vmovq %xmm0, %rdx 1054 ; AVX1-NEXT: movq %rdx, %r9 1055 ; AVX1-NEXT: movl %edx, %r10d 1056 ; AVX1-NEXT: movswl %dx, %r8d 1057 ; AVX1-NEXT: shrq $48, %rdx 1058 ; AVX1-NEXT: shrq $32, %r9 1059 ; AVX1-NEXT: shrl $16, %r10d 1060 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdi 1061 ; AVX1-NEXT: movq %rdi, %rsi 1062 ; AVX1-NEXT: movl %edi, %eax 1063 ; AVX1-NEXT: movswl %di, %ecx 1064 ; AVX1-NEXT: shrq $48, %rdi 1065 ; AVX1-NEXT: shrq $32, %rsi 1066 ; AVX1-NEXT: shrl $16, %eax 1067 ; AVX1-NEXT: cwtl 1068 ; AVX1-NEXT: vmovd %eax, %xmm0 1069 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1070 ; AVX1-NEXT: vmovd %ecx, %xmm0 1071 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1072 ; AVX1-NEXT: movswl %si, %eax 1073 ; AVX1-NEXT: vmovd %eax, %xmm0 1074 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1075 ; AVX1-NEXT: movswl %di, %eax 1076 ; AVX1-NEXT: vmovd %eax, %xmm0 1077 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1078 ; AVX1-NEXT: movswl %r10w, %eax 1079 ; AVX1-NEXT: vmovd %eax, %xmm0 1080 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1081 ; AVX1-NEXT: vmovd %r8d, %xmm5 1082 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1083 ; AVX1-NEXT: movswl %r9w, %eax 1084 ; AVX1-NEXT: vmovd %eax, %xmm6 1085 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1086 ; AVX1-NEXT: movswl %dx, %eax 1087 ; AVX1-NEXT: vmovd %eax, %xmm7 1088 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1089 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1090 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1091 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1092 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1093 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1094 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1095 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1096 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1097 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1098 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1099 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1100 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1101 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1102 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1103 ; AVX1-NEXT: retq 1104 ; 1105 ; AVX2-LABEL: cvt_8i16_to_8f64: 1106 ; AVX2: # BB#0: 1107 ; AVX2-NEXT: vmovq %xmm0, %rdx 1108 ; AVX2-NEXT: movq %rdx, %r9 1109 ; AVX2-NEXT: movl %edx, %r10d 1110 ; AVX2-NEXT: movswl %dx, %r8d 1111 ; AVX2-NEXT: shrq $48, %rdx 1112 ; AVX2-NEXT: shrq $32, %r9 1113 ; AVX2-NEXT: shrl $16, %r10d 1114 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdi 1115 ; AVX2-NEXT: movq %rdi, %rsi 1116 ; AVX2-NEXT: movl %edi, %eax 1117 ; AVX2-NEXT: movswl %di, %ecx 1118 ; AVX2-NEXT: shrq $48, %rdi 1119 ; AVX2-NEXT: shrq $32, %rsi 1120 ; AVX2-NEXT: shrl $16, %eax 1121 ; AVX2-NEXT: cwtl 1122 ; AVX2-NEXT: vmovd %eax, %xmm0 1123 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1124 ; AVX2-NEXT: vmovd %ecx, %xmm0 1125 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1126 ; AVX2-NEXT: movswl %si, %eax 1127 ; AVX2-NEXT: vmovd %eax, %xmm0 1128 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1129 ; AVX2-NEXT: movswl %di, %eax 1130 ; AVX2-NEXT: vmovd %eax, %xmm0 1131 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1132 ; AVX2-NEXT: movswl %r10w, %eax 1133 ; AVX2-NEXT: vmovd %eax, %xmm0 1134 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1135 ; AVX2-NEXT: vmovd %r8d, %xmm5 1136 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1137 ; AVX2-NEXT: movswl %r9w, %eax 1138 ; AVX2-NEXT: vmovd %eax, %xmm6 1139 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1140 ; AVX2-NEXT: movswl %dx, %eax 1141 ; AVX2-NEXT: vmovd %eax, %xmm7 1142 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1143 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1144 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1145 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1146 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1147 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1148 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1149 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1150 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1151 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1152 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1153 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1154 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1155 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1156 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1157 ; AVX2-NEXT: retq 1158 ; 1159 ; AVX512-LABEL: cvt_8i16_to_8f64: 1160 ; AVX512: # BB#0: 1161 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx 1162 ; AVX512-NEXT: movq %rdx, %r8 1163 ; AVX512-NEXT: movl %edx, %r10d 1164 ; AVX512-NEXT: movswl %dx, %r9d 1165 ; AVX512-NEXT: shrq $48, %rdx 1166 ; AVX512-NEXT: shrq $32, %r8 1167 ; AVX512-NEXT: shrl $16, %r10d 1168 ; AVX512-NEXT: vmovq %xmm0, %rdi 1169 ; AVX512-NEXT: movq %rdi, %rax 1170 ; AVX512-NEXT: movl %edi, %esi 1171 ; AVX512-NEXT: movswl %di, %ecx 1172 ; AVX512-NEXT: shrq $48, %rdi 1173 ; AVX512-NEXT: shrq $32, %rax 1174 ; AVX512-NEXT: shrl $16, %esi 1175 ; AVX512-NEXT: movswl %si, %esi 1176 ; AVX512-NEXT: vmovd %esi, %xmm0 1177 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1178 ; AVX512-NEXT: vmovd %ecx, %xmm1 1179 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1180 ; AVX512-NEXT: cwtl 1181 ; AVX512-NEXT: vmovd %eax, %xmm2 1182 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1183 ; AVX512-NEXT: movswl %di, %eax 1184 ; AVX512-NEXT: vmovd %eax, %xmm3 1185 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1186 ; AVX512-NEXT: movswl %r10w, %eax 1187 ; AVX512-NEXT: vmovd %eax, %xmm4 1188 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1189 ; AVX512-NEXT: vmovd %r9d, %xmm5 1190 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1191 ; AVX512-NEXT: movswl %r8w, %eax 1192 ; AVX512-NEXT: vmovd %eax, %xmm6 1193 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1194 ; AVX512-NEXT: movswl %dx, %eax 1195 ; AVX512-NEXT: vmovd %eax, %xmm7 1196 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1197 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1198 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1199 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1200 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1201 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1202 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0] 1203 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1204 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1205 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1206 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1207 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1208 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1209 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1210 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1211 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1212 ; AVX512-NEXT: retq 1213 %1 = bitcast <8 x i16> %a0 to <8 x half> 1214 %2 = fpext <8 x half> %1 to <8 x double> 1215 ret <8 x double> %2 1216 } 1217 1218 ; 1219 ; Half to Double (Load) 1220 ; 1221 1222 define double @load_cvt_i16_to_f64(i16* %a0) { 1223 ; ALL-LABEL: load_cvt_i16_to_f64: 1224 ; ALL: # BB#0: 1225 ; ALL-NEXT: movswl (%rdi), %eax 1226 ; ALL-NEXT: vmovd %eax, %xmm0 1227 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1228 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1229 ; ALL-NEXT: retq 1230 %1 = load i16, i16* %a0 1231 %2 = bitcast i16 %1 to half 1232 %3 = fpext half %2 to double 1233 ret double %3 1234 } 1235 1236 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) { 1237 ; ALL-LABEL: load_cvt_2i16_to_2f64: 1238 ; ALL: # BB#0: 1239 ; ALL-NEXT: movswl (%rdi), %eax 1240 ; ALL-NEXT: vmovd %eax, %xmm0 1241 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1242 ; ALL-NEXT: movswl 2(%rdi), %eax 1243 ; ALL-NEXT: vmovd %eax, %xmm1 1244 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1245 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1246 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1247 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1248 ; ALL-NEXT: retq 1249 %1 = load <2 x i16>, <2 x i16>* %a0 1250 %2 = bitcast <2 x i16> %1 to <2 x half> 1251 %3 = fpext <2 x half> %2 to <2 x double> 1252 ret <2 x double> %3 1253 } 1254 1255 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) { 1256 ; ALL-LABEL: load_cvt_4i16_to_4f64: 1257 ; ALL: # BB#0: 1258 ; ALL-NEXT: movswl (%rdi), %eax 1259 ; ALL-NEXT: vmovd %eax, %xmm0 1260 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1261 ; ALL-NEXT: movswl 2(%rdi), %eax 1262 ; ALL-NEXT: vmovd %eax, %xmm1 1263 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1264 ; ALL-NEXT: movswl 4(%rdi), %eax 1265 ; ALL-NEXT: vmovd %eax, %xmm2 1266 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1267 ; ALL-NEXT: movswl 6(%rdi), %eax 1268 ; ALL-NEXT: vmovd %eax, %xmm3 1269 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1270 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1271 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1272 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1273 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1274 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1275 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1276 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1277 ; ALL-NEXT: retq 1278 %1 = load <4 x i16>, <4 x i16>* %a0 1279 %2 = bitcast <4 x i16> %1 to <4 x half> 1280 %3 = fpext <4 x half> %2 to <4 x double> 1281 ret <4 x double> %3 1282 } 1283 1284 define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) { 1285 ; ALL-LABEL: load_cvt_8i16_to_4f64: 1286 ; ALL: # BB#0: 1287 ; ALL-NEXT: movq (%rdi), %rax 1288 ; ALL-NEXT: movq %rax, %rcx 1289 ; ALL-NEXT: movl %eax, %edx 1290 ; ALL-NEXT: movswl %ax, %esi 1291 ; ALL-NEXT: shrq $48, %rax 1292 ; ALL-NEXT: shrq $32, %rcx 1293 ; ALL-NEXT: shrl $16, %edx 1294 ; ALL-NEXT: movswl %dx, %edx 1295 ; ALL-NEXT: vmovd %edx, %xmm0 1296 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1297 ; ALL-NEXT: vmovd %esi, %xmm1 1298 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1299 ; ALL-NEXT: movswl %cx, %ecx 1300 ; ALL-NEXT: vmovd %ecx, %xmm2 1301 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1302 ; ALL-NEXT: cwtl 1303 ; ALL-NEXT: vmovd %eax, %xmm3 1304 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1305 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1306 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1307 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1308 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1309 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1310 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1311 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1312 ; ALL-NEXT: retq 1313 %1 = load <8 x i16>, <8 x i16>* %a0 1314 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1315 %3 = bitcast <4 x i16> %2 to <4 x half> 1316 %4 = fpext <4 x half> %3 to <4 x double> 1317 ret <4 x double> %4 1318 } 1319 1320 define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) { 1321 ; AVX1-LABEL: load_cvt_8i16_to_8f64: 1322 ; AVX1: # BB#0: 1323 ; AVX1-NEXT: movswl 8(%rdi), %eax 1324 ; AVX1-NEXT: vmovd %eax, %xmm0 1325 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1326 ; AVX1-NEXT: movswl 10(%rdi), %eax 1327 ; AVX1-NEXT: vmovd %eax, %xmm0 1328 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1329 ; AVX1-NEXT: movswl 12(%rdi), %eax 1330 ; AVX1-NEXT: vmovd %eax, %xmm0 1331 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1332 ; AVX1-NEXT: movswl 14(%rdi), %eax 1333 ; AVX1-NEXT: vmovd %eax, %xmm0 1334 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1335 ; AVX1-NEXT: movswl (%rdi), %eax 1336 ; AVX1-NEXT: vmovd %eax, %xmm0 1337 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1338 ; AVX1-NEXT: movswl 2(%rdi), %eax 1339 ; AVX1-NEXT: vmovd %eax, %xmm5 1340 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1341 ; AVX1-NEXT: movswl 4(%rdi), %eax 1342 ; AVX1-NEXT: vmovd %eax, %xmm6 1343 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1344 ; AVX1-NEXT: movswl 6(%rdi), %eax 1345 ; AVX1-NEXT: vmovd %eax, %xmm7 1346 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1347 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1348 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1349 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1350 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1351 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1352 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1353 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1354 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1355 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1356 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1357 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1358 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1359 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1360 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1361 ; AVX1-NEXT: retq 1362 ; 1363 ; AVX2-LABEL: load_cvt_8i16_to_8f64: 1364 ; AVX2: # BB#0: 1365 ; AVX2-NEXT: movswl 8(%rdi), %eax 1366 ; AVX2-NEXT: vmovd %eax, %xmm0 1367 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1368 ; AVX2-NEXT: movswl 10(%rdi), %eax 1369 ; AVX2-NEXT: vmovd %eax, %xmm0 1370 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1371 ; AVX2-NEXT: movswl 12(%rdi), %eax 1372 ; AVX2-NEXT: vmovd %eax, %xmm0 1373 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1374 ; AVX2-NEXT: movswl 14(%rdi), %eax 1375 ; AVX2-NEXT: vmovd %eax, %xmm0 1376 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1377 ; AVX2-NEXT: movswl (%rdi), %eax 1378 ; AVX2-NEXT: vmovd %eax, %xmm0 1379 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1380 ; AVX2-NEXT: movswl 2(%rdi), %eax 1381 ; AVX2-NEXT: vmovd %eax, %xmm5 1382 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1383 ; AVX2-NEXT: movswl 4(%rdi), %eax 1384 ; AVX2-NEXT: vmovd %eax, %xmm6 1385 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1386 ; AVX2-NEXT: movswl 6(%rdi), %eax 1387 ; AVX2-NEXT: vmovd %eax, %xmm7 1388 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1389 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1390 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1391 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1392 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1393 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1394 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1395 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1396 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1397 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1398 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1399 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1400 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1401 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1402 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1403 ; AVX2-NEXT: retq 1404 ; 1405 ; AVX512-LABEL: load_cvt_8i16_to_8f64: 1406 ; AVX512: # BB#0: 1407 ; AVX512-NEXT: movswl (%rdi), %eax 1408 ; AVX512-NEXT: vmovd %eax, %xmm0 1409 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1410 ; AVX512-NEXT: movswl 2(%rdi), %eax 1411 ; AVX512-NEXT: vmovd %eax, %xmm1 1412 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1413 ; AVX512-NEXT: movswl 4(%rdi), %eax 1414 ; AVX512-NEXT: vmovd %eax, %xmm2 1415 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1416 ; AVX512-NEXT: movswl 6(%rdi), %eax 1417 ; AVX512-NEXT: vmovd %eax, %xmm3 1418 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1419 ; AVX512-NEXT: movswl 8(%rdi), %eax 1420 ; AVX512-NEXT: vmovd %eax, %xmm4 1421 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1422 ; AVX512-NEXT: movswl 10(%rdi), %eax 1423 ; AVX512-NEXT: vmovd %eax, %xmm5 1424 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1425 ; AVX512-NEXT: movswl 12(%rdi), %eax 1426 ; AVX512-NEXT: vmovd %eax, %xmm6 1427 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1428 ; AVX512-NEXT: movswl 14(%rdi), %eax 1429 ; AVX512-NEXT: vmovd %eax, %xmm7 1430 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1431 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1432 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1433 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1434 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1435 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1436 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] 1437 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1438 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1439 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1440 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1441 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1442 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1443 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1444 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1445 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1446 ; AVX512-NEXT: retq 1447 %1 = load <8 x i16>, <8 x i16>* %a0 1448 %2 = bitcast <8 x i16> %1 to <8 x half> 1449 %3 = fpext <8 x half> %2 to <8 x double> 1450 ret <8 x double> %3 1451 } 1452 1453 ; 1454 ; Float to Half 1455 ; 1456 1457 define i16 @cvt_f32_to_i16(float %a0) { 1458 ; ALL-LABEL: cvt_f32_to_i16: 1459 ; ALL: # BB#0: 1460 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1461 ; ALL-NEXT: vmovd %xmm0, %eax 1462 ; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> 1463 ; ALL-NEXT: retq 1464 %1 = fptrunc float %a0 to half 1465 %2 = bitcast half %1 to i16 1466 ret i16 %2 1467 } 1468 1469 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) { 1470 ; ALL-LABEL: cvt_4f32_to_4i16: 1471 ; ALL: # BB#0: 1472 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1473 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1474 ; ALL-NEXT: vmovd %xmm1, %eax 1475 ; ALL-NEXT: shll $16, %eax 1476 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1477 ; ALL-NEXT: vmovd %xmm1, %ecx 1478 ; ALL-NEXT: movzwl %cx, %ecx 1479 ; ALL-NEXT: orl %eax, %ecx 1480 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1481 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1482 ; ALL-NEXT: vmovd %xmm1, %eax 1483 ; ALL-NEXT: shll $16, %eax 1484 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1485 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1486 ; ALL-NEXT: vmovd %xmm0, %edx 1487 ; ALL-NEXT: movzwl %dx, %edx 1488 ; ALL-NEXT: orl %eax, %edx 1489 ; ALL-NEXT: shlq $32, %rdx 1490 ; ALL-NEXT: orq %rcx, %rdx 1491 ; ALL-NEXT: vmovq %rdx, %xmm0 1492 ; ALL-NEXT: retq 1493 %1 = fptrunc <4 x float> %a0 to <4 x half> 1494 %2 = bitcast <4 x half> %1 to <4 x i16> 1495 ret <4 x i16> %2 1496 } 1497 1498 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) { 1499 ; ALL-LABEL: cvt_4f32_to_8i16_undef: 1500 ; ALL: # BB#0: 1501 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1502 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1503 ; ALL-NEXT: vmovd %xmm1, %eax 1504 ; ALL-NEXT: shll $16, %eax 1505 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1506 ; ALL-NEXT: vmovd %xmm1, %ecx 1507 ; ALL-NEXT: movzwl %cx, %ecx 1508 ; ALL-NEXT: orl %eax, %ecx 1509 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1510 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1511 ; ALL-NEXT: vmovd %xmm1, %eax 1512 ; ALL-NEXT: shll $16, %eax 1513 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1514 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1515 ; ALL-NEXT: vmovd %xmm0, %edx 1516 ; ALL-NEXT: movzwl %dx, %edx 1517 ; ALL-NEXT: orl %eax, %edx 1518 ; ALL-NEXT: shlq $32, %rdx 1519 ; ALL-NEXT: orq %rcx, %rdx 1520 ; ALL-NEXT: vmovq %rdx, %xmm0 1521 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1522 ; ALL-NEXT: retq 1523 %1 = fptrunc <4 x float> %a0 to <4 x half> 1524 %2 = bitcast <4 x half> %1 to <4 x i16> 1525 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1526 ret <8 x i16> %3 1527 } 1528 1529 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) { 1530 ; ALL-LABEL: cvt_4f32_to_8i16_zero: 1531 ; ALL: # BB#0: 1532 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1533 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1534 ; ALL-NEXT: vmovd %xmm1, %eax 1535 ; ALL-NEXT: shll $16, %eax 1536 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1537 ; ALL-NEXT: vmovd %xmm1, %ecx 1538 ; ALL-NEXT: movzwl %cx, %ecx 1539 ; ALL-NEXT: orl %eax, %ecx 1540 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1541 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1542 ; ALL-NEXT: vmovd %xmm1, %eax 1543 ; ALL-NEXT: shll $16, %eax 1544 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1545 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1546 ; ALL-NEXT: vmovd %xmm0, %edx 1547 ; ALL-NEXT: movzwl %dx, %edx 1548 ; ALL-NEXT: orl %eax, %edx 1549 ; ALL-NEXT: shlq $32, %rdx 1550 ; ALL-NEXT: orq %rcx, %rdx 1551 ; ALL-NEXT: vmovq %rdx, %xmm0 1552 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 1553 ; ALL-NEXT: retq 1554 %1 = fptrunc <4 x float> %a0 to <4 x half> 1555 %2 = bitcast <4 x half> %1 to <4 x i16> 1556 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1557 ret <8 x i16> %3 1558 } 1559 1560 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) { 1561 ; AVX1-LABEL: cvt_8f32_to_8i16: 1562 ; AVX1: # BB#0: 1563 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1564 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1565 ; AVX1-NEXT: vmovd %xmm1, %eax 1566 ; AVX1-NEXT: shll $16, %eax 1567 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1568 ; AVX1-NEXT: vmovd %xmm1, %ecx 1569 ; AVX1-NEXT: movzwl %cx, %ecx 1570 ; AVX1-NEXT: orl %eax, %ecx 1571 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1572 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1573 ; AVX1-NEXT: vmovd %xmm1, %edx 1574 ; AVX1-NEXT: shll $16, %edx 1575 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1576 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1577 ; AVX1-NEXT: vmovd %xmm1, %eax 1578 ; AVX1-NEXT: movzwl %ax, %eax 1579 ; AVX1-NEXT: orl %edx, %eax 1580 ; AVX1-NEXT: shlq $32, %rax 1581 ; AVX1-NEXT: orq %rcx, %rax 1582 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1583 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1584 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1585 ; AVX1-NEXT: vmovd %xmm1, %ecx 1586 ; AVX1-NEXT: shll $16, %ecx 1587 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1588 ; AVX1-NEXT: vmovd %xmm1, %edx 1589 ; AVX1-NEXT: movzwl %dx, %edx 1590 ; AVX1-NEXT: orl %ecx, %edx 1591 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1592 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1593 ; AVX1-NEXT: vmovd %xmm1, %ecx 1594 ; AVX1-NEXT: shll $16, %ecx 1595 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1596 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1597 ; AVX1-NEXT: vmovd %xmm0, %esi 1598 ; AVX1-NEXT: movzwl %si, %esi 1599 ; AVX1-NEXT: orl %ecx, %esi 1600 ; AVX1-NEXT: shlq $32, %rsi 1601 ; AVX1-NEXT: orq %rdx, %rsi 1602 ; AVX1-NEXT: vmovq %rsi, %xmm0 1603 ; AVX1-NEXT: vmovq %rax, %xmm1 1604 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1605 ; AVX1-NEXT: vzeroupper 1606 ; AVX1-NEXT: retq 1607 ; 1608 ; AVX2-LABEL: cvt_8f32_to_8i16: 1609 ; AVX2: # BB#0: 1610 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1611 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1612 ; AVX2-NEXT: vmovd %xmm1, %eax 1613 ; AVX2-NEXT: shll $16, %eax 1614 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1615 ; AVX2-NEXT: vmovd %xmm1, %ecx 1616 ; AVX2-NEXT: movzwl %cx, %ecx 1617 ; AVX2-NEXT: orl %eax, %ecx 1618 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1619 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1620 ; AVX2-NEXT: vmovd %xmm1, %edx 1621 ; AVX2-NEXT: shll $16, %edx 1622 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1623 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1624 ; AVX2-NEXT: vmovd %xmm1, %eax 1625 ; AVX2-NEXT: movzwl %ax, %eax 1626 ; AVX2-NEXT: orl %edx, %eax 1627 ; AVX2-NEXT: shlq $32, %rax 1628 ; AVX2-NEXT: orq %rcx, %rax 1629 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1630 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1631 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1632 ; AVX2-NEXT: vmovd %xmm1, %ecx 1633 ; AVX2-NEXT: shll $16, %ecx 1634 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1635 ; AVX2-NEXT: vmovd %xmm1, %edx 1636 ; AVX2-NEXT: movzwl %dx, %edx 1637 ; AVX2-NEXT: orl %ecx, %edx 1638 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1639 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1640 ; AVX2-NEXT: vmovd %xmm1, %ecx 1641 ; AVX2-NEXT: shll $16, %ecx 1642 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1643 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1644 ; AVX2-NEXT: vmovd %xmm0, %esi 1645 ; AVX2-NEXT: movzwl %si, %esi 1646 ; AVX2-NEXT: orl %ecx, %esi 1647 ; AVX2-NEXT: shlq $32, %rsi 1648 ; AVX2-NEXT: orq %rdx, %rsi 1649 ; AVX2-NEXT: vmovq %rsi, %xmm0 1650 ; AVX2-NEXT: vmovq %rax, %xmm1 1651 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1652 ; AVX2-NEXT: vzeroupper 1653 ; AVX2-NEXT: retq 1654 ; 1655 ; AVX512-LABEL: cvt_8f32_to_8i16: 1656 ; AVX512: # BB#0: 1657 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1658 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1659 ; AVX512-NEXT: vmovd %xmm1, %eax 1660 ; AVX512-NEXT: shll $16, %eax 1661 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1662 ; AVX512-NEXT: vmovd %xmm1, %ecx 1663 ; AVX512-NEXT: movzwl %cx, %ecx 1664 ; AVX512-NEXT: orl %eax, %ecx 1665 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1666 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1667 ; AVX512-NEXT: vmovd %xmm1, %edx 1668 ; AVX512-NEXT: shll $16, %edx 1669 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1670 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1671 ; AVX512-NEXT: vmovd %xmm1, %eax 1672 ; AVX512-NEXT: movzwl %ax, %eax 1673 ; AVX512-NEXT: orl %edx, %eax 1674 ; AVX512-NEXT: shlq $32, %rax 1675 ; AVX512-NEXT: orq %rcx, %rax 1676 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1677 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1678 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1679 ; AVX512-NEXT: vmovd %xmm1, %ecx 1680 ; AVX512-NEXT: shll $16, %ecx 1681 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1682 ; AVX512-NEXT: vmovd %xmm1, %edx 1683 ; AVX512-NEXT: movzwl %dx, %edx 1684 ; AVX512-NEXT: orl %ecx, %edx 1685 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1686 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1687 ; AVX512-NEXT: vmovd %xmm1, %ecx 1688 ; AVX512-NEXT: shll $16, %ecx 1689 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1690 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1691 ; AVX512-NEXT: vmovd %xmm0, %esi 1692 ; AVX512-NEXT: movzwl %si, %esi 1693 ; AVX512-NEXT: orl %ecx, %esi 1694 ; AVX512-NEXT: shlq $32, %rsi 1695 ; AVX512-NEXT: orq %rdx, %rsi 1696 ; AVX512-NEXT: vmovq %rsi, %xmm0 1697 ; AVX512-NEXT: vmovq %rax, %xmm1 1698 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1699 ; AVX512-NEXT: retq 1700 %1 = fptrunc <8 x float> %a0 to <8 x half> 1701 %2 = bitcast <8 x half> %1 to <8 x i16> 1702 ret <8 x i16> %2 1703 } 1704 1705 define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) { 1706 ; AVX1-LABEL: cvt_16f32_to_16i16: 1707 ; AVX1: # BB#0: 1708 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1709 ; AVX1-NEXT: vmovd %xmm2, %eax 1710 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1711 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1712 ; AVX1-NEXT: vmovd %eax, %xmm3 1713 ; AVX1-NEXT: vmovd %xmm2, %eax 1714 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1715 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1716 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1717 ; AVX1-NEXT: vmovd %xmm2, %eax 1718 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1719 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1720 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1721 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1722 ; AVX1-NEXT: vmovd %xmm1, %eax 1723 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1724 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1725 ; AVX1-NEXT: vmovd %xmm1, %eax 1726 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1727 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1728 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1729 ; AVX1-NEXT: vmovd %xmm1, %eax 1730 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1731 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1732 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1733 ; AVX1-NEXT: vmovd %xmm1, %eax 1734 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1735 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1736 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1737 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1738 ; AVX1-NEXT: vmovd %xmm2, %eax 1739 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1740 ; AVX1-NEXT: vmovd %xmm1, %eax 1741 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1742 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1743 ; AVX1-NEXT: vmovd %eax, %xmm3 1744 ; AVX1-NEXT: vmovd %xmm1, %eax 1745 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1746 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1747 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1748 ; AVX1-NEXT: vmovd %xmm1, %eax 1749 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1750 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1751 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1752 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1753 ; AVX1-NEXT: vmovd %xmm0, %eax 1754 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1755 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1756 ; AVX1-NEXT: vmovd %xmm0, %eax 1757 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1758 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1759 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1760 ; AVX1-NEXT: vmovd %xmm0, %eax 1761 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1762 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1763 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1764 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1765 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1766 ; AVX1-NEXT: vmovd %xmm1, %eax 1767 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 1768 ; AVX1-NEXT: vmovd %xmm0, %eax 1769 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1770 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1771 ; AVX1-NEXT: retq 1772 ; 1773 ; AVX2-LABEL: cvt_16f32_to_16i16: 1774 ; AVX2: # BB#0: 1775 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1776 ; AVX2-NEXT: vmovd %xmm2, %eax 1777 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1778 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1779 ; AVX2-NEXT: vmovd %eax, %xmm3 1780 ; AVX2-NEXT: vmovd %xmm2, %eax 1781 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1782 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1783 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1784 ; AVX2-NEXT: vmovd %xmm2, %eax 1785 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 1786 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1787 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1788 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1789 ; AVX2-NEXT: vmovd %xmm1, %eax 1790 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1791 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1792 ; AVX2-NEXT: vmovd %xmm1, %eax 1793 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1794 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1795 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1796 ; AVX2-NEXT: vmovd %xmm1, %eax 1797 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1798 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1799 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1800 ; AVX2-NEXT: vmovd %xmm1, %eax 1801 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1802 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1803 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1804 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1805 ; AVX2-NEXT: vmovd %xmm2, %eax 1806 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1807 ; AVX2-NEXT: vmovd %xmm1, %eax 1808 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1809 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1810 ; AVX2-NEXT: vmovd %eax, %xmm3 1811 ; AVX2-NEXT: vmovd %xmm1, %eax 1812 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1813 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1814 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1815 ; AVX2-NEXT: vmovd %xmm1, %eax 1816 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 1817 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1818 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1819 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1820 ; AVX2-NEXT: vmovd %xmm0, %eax 1821 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1822 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1823 ; AVX2-NEXT: vmovd %xmm0, %eax 1824 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1825 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1826 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1827 ; AVX2-NEXT: vmovd %xmm0, %eax 1828 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1829 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1830 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1831 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1832 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1833 ; AVX2-NEXT: vmovd %xmm1, %eax 1834 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 1835 ; AVX2-NEXT: vmovd %xmm0, %eax 1836 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1837 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1838 ; AVX2-NEXT: retq 1839 ; 1840 ; AVX512-LABEL: cvt_16f32_to_16i16: 1841 ; AVX512: # BB#0: 1842 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 1843 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1844 ; AVX512-NEXT: vmovd %xmm2, %eax 1845 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1846 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1847 ; AVX512-NEXT: vmovd %eax, %xmm3 1848 ; AVX512-NEXT: vmovd %xmm2, %eax 1849 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1850 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1851 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1852 ; AVX512-NEXT: vmovd %xmm2, %eax 1853 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1854 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1855 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1856 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1857 ; AVX512-NEXT: vmovd %xmm1, %eax 1858 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1859 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1860 ; AVX512-NEXT: vmovd %xmm1, %eax 1861 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1862 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1863 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1864 ; AVX512-NEXT: vmovd %xmm1, %eax 1865 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1866 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1867 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1868 ; AVX512-NEXT: vmovd %xmm1, %eax 1869 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1870 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1871 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1872 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1873 ; AVX512-NEXT: vmovd %xmm2, %eax 1874 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1875 ; AVX512-NEXT: vmovd %xmm1, %eax 1876 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1877 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1878 ; AVX512-NEXT: vmovd %eax, %xmm3 1879 ; AVX512-NEXT: vmovd %xmm1, %eax 1880 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1881 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1882 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1883 ; AVX512-NEXT: vmovd %xmm1, %eax 1884 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 1885 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1886 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1887 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1888 ; AVX512-NEXT: vmovd %xmm0, %eax 1889 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1890 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1891 ; AVX512-NEXT: vmovd %xmm0, %eax 1892 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1893 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1894 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1895 ; AVX512-NEXT: vmovd %xmm0, %eax 1896 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] 1897 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1898 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1899 ; AVX512-NEXT: vmovd %xmm0, %eax 1900 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1901 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1902 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 1903 ; AVX512-NEXT: vmovd %xmm0, %eax 1904 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1905 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1906 ; AVX512-NEXT: retq 1907 %1 = fptrunc <16 x float> %a0 to <16 x half> 1908 %2 = bitcast <16 x half> %1 to <16 x i16> 1909 ret <16 x i16> %2 1910 } 1911 1912 ; 1913 ; Float to Half (Store) 1914 ; 1915 1916 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) { 1917 ; ALL-LABEL: store_cvt_f32_to_i16: 1918 ; ALL: # BB#0: 1919 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1920 ; ALL-NEXT: vmovd %xmm0, %eax 1921 ; ALL-NEXT: movw %ax, (%rdi) 1922 ; ALL-NEXT: retq 1923 %1 = fptrunc float %a0 to half 1924 %2 = bitcast half %1 to i16 1925 store i16 %2, i16* %a1 1926 ret void 1927 } 1928 1929 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) { 1930 ; ALL-LABEL: store_cvt_4f32_to_4i16: 1931 ; ALL: # BB#0: 1932 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1933 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1934 ; ALL-NEXT: vmovd %xmm1, %eax 1935 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1936 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1937 ; ALL-NEXT: vmovd %xmm1, %ecx 1938 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1939 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1940 ; ALL-NEXT: vmovd %xmm1, %edx 1941 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1942 ; ALL-NEXT: vmovd %xmm0, %esi 1943 ; ALL-NEXT: movw %si, (%rdi) 1944 ; ALL-NEXT: movw %dx, 6(%rdi) 1945 ; ALL-NEXT: movw %cx, 4(%rdi) 1946 ; ALL-NEXT: movw %ax, 2(%rdi) 1947 ; ALL-NEXT: retq 1948 %1 = fptrunc <4 x float> %a0 to <4 x half> 1949 %2 = bitcast <4 x half> %1 to <4 x i16> 1950 store <4 x i16> %2, <4 x i16>* %a1 1951 ret void 1952 } 1953 1954 define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) { 1955 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef: 1956 ; ALL: # BB#0: 1957 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1958 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1959 ; ALL-NEXT: vmovd %xmm1, %eax 1960 ; ALL-NEXT: shll $16, %eax 1961 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1962 ; ALL-NEXT: vmovd %xmm1, %ecx 1963 ; ALL-NEXT: movzwl %cx, %ecx 1964 ; ALL-NEXT: orl %eax, %ecx 1965 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1966 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1967 ; ALL-NEXT: vmovd %xmm1, %eax 1968 ; ALL-NEXT: shll $16, %eax 1969 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1970 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1971 ; ALL-NEXT: vmovd %xmm0, %edx 1972 ; ALL-NEXT: movzwl %dx, %edx 1973 ; ALL-NEXT: orl %eax, %edx 1974 ; ALL-NEXT: shlq $32, %rdx 1975 ; ALL-NEXT: orq %rcx, %rdx 1976 ; ALL-NEXT: vmovq %rdx, %xmm0 1977 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1978 ; ALL-NEXT: vmovdqa %xmm0, (%rdi) 1979 ; ALL-NEXT: retq 1980 %1 = fptrunc <4 x float> %a0 to <4 x half> 1981 %2 = bitcast <4 x half> %1 to <4 x i16> 1982 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1983 store <8 x i16> %3, <8 x i16>* %a1 1984 ret void 1985 } 1986 1987 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) { 1988 ; ALL-LABEL: store_cvt_4f32_to_8i16_zero: 1989 ; ALL: # BB#0: 1990 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1991 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1992 ; ALL-NEXT: vmovd %xmm1, %eax 1993 ; ALL-NEXT: shll $16, %eax 1994 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1995 ; ALL-NEXT: vmovd %xmm1, %ecx 1996 ; ALL-NEXT: movzwl %cx, %ecx 1997 ; ALL-NEXT: orl %eax, %ecx 1998 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1999 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2000 ; ALL-NEXT: vmovd %xmm1, %eax 2001 ; ALL-NEXT: shll $16, %eax 2002 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2003 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2004 ; ALL-NEXT: vmovd %xmm0, %edx 2005 ; ALL-NEXT: movzwl %dx, %edx 2006 ; ALL-NEXT: orl %eax, %edx 2007 ; ALL-NEXT: shlq $32, %rdx 2008 ; ALL-NEXT: orq %rcx, %rdx 2009 ; ALL-NEXT: vmovq %rdx, %xmm0 2010 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2011 ; ALL-NEXT: vmovdqa %xmm0, (%rdi) 2012 ; ALL-NEXT: retq 2013 %1 = fptrunc <4 x float> %a0 to <4 x half> 2014 %2 = bitcast <4 x half> %1 to <4 x i16> 2015 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2016 store <8 x i16> %3, <8 x i16>* %a1 2017 ret void 2018 } 2019 2020 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) { 2021 ; AVX1-LABEL: store_cvt_8f32_to_8i16: 2022 ; AVX1: # BB#0: 2023 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2024 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2025 ; AVX1-NEXT: vmovd %xmm1, %r8d 2026 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2027 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2028 ; AVX1-NEXT: vmovd %xmm1, %r9d 2029 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2030 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2031 ; AVX1-NEXT: vmovd %xmm1, %r10d 2032 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2033 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2034 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2035 ; AVX1-NEXT: vmovd %xmm2, %r11d 2036 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2037 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2038 ; AVX1-NEXT: vmovd %xmm2, %eax 2039 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2040 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2041 ; AVX1-NEXT: vmovd %xmm2, %ecx 2042 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2043 ; AVX1-NEXT: vmovd %xmm0, %edx 2044 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2045 ; AVX1-NEXT: vmovd %xmm0, %esi 2046 ; AVX1-NEXT: movw %si, 8(%rdi) 2047 ; AVX1-NEXT: movw %dx, (%rdi) 2048 ; AVX1-NEXT: movw %cx, 14(%rdi) 2049 ; AVX1-NEXT: movw %ax, 12(%rdi) 2050 ; AVX1-NEXT: movw %r11w, 10(%rdi) 2051 ; AVX1-NEXT: movw %r10w, 6(%rdi) 2052 ; AVX1-NEXT: movw %r9w, 4(%rdi) 2053 ; AVX1-NEXT: movw %r8w, 2(%rdi) 2054 ; AVX1-NEXT: vzeroupper 2055 ; AVX1-NEXT: retq 2056 ; 2057 ; AVX2-LABEL: store_cvt_8f32_to_8i16: 2058 ; AVX2: # BB#0: 2059 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2060 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2061 ; AVX2-NEXT: vmovd %xmm1, %r8d 2062 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2063 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2064 ; AVX2-NEXT: vmovd %xmm1, %r9d 2065 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2066 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2067 ; AVX2-NEXT: vmovd %xmm1, %r10d 2068 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 2069 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2070 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2071 ; AVX2-NEXT: vmovd %xmm2, %r11d 2072 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2073 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2074 ; AVX2-NEXT: vmovd %xmm2, %eax 2075 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2076 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2077 ; AVX2-NEXT: vmovd %xmm2, %ecx 2078 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2079 ; AVX2-NEXT: vmovd %xmm0, %edx 2080 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2081 ; AVX2-NEXT: vmovd %xmm0, %esi 2082 ; AVX2-NEXT: movw %si, 8(%rdi) 2083 ; AVX2-NEXT: movw %dx, (%rdi) 2084 ; AVX2-NEXT: movw %cx, 14(%rdi) 2085 ; AVX2-NEXT: movw %ax, 12(%rdi) 2086 ; AVX2-NEXT: movw %r11w, 10(%rdi) 2087 ; AVX2-NEXT: movw %r10w, 6(%rdi) 2088 ; AVX2-NEXT: movw %r9w, 4(%rdi) 2089 ; AVX2-NEXT: movw %r8w, 2(%rdi) 2090 ; AVX2-NEXT: vzeroupper 2091 ; AVX2-NEXT: retq 2092 ; 2093 ; AVX512-LABEL: store_cvt_8f32_to_8i16: 2094 ; AVX512: # BB#0: 2095 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2096 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2097 ; AVX512-NEXT: vmovd %xmm1, %r8d 2098 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2099 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2100 ; AVX512-NEXT: vmovd %xmm1, %r9d 2101 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2102 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2103 ; AVX512-NEXT: vmovd %xmm1, %r10d 2104 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2105 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2106 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2107 ; AVX512-NEXT: vmovd %xmm2, %r11d 2108 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2109 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2110 ; AVX512-NEXT: vmovd %xmm2, %eax 2111 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2112 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2113 ; AVX512-NEXT: vmovd %xmm2, %ecx 2114 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2115 ; AVX512-NEXT: vmovd %xmm0, %edx 2116 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2117 ; AVX512-NEXT: vmovd %xmm0, %esi 2118 ; AVX512-NEXT: movw %si, 8(%rdi) 2119 ; AVX512-NEXT: movw %dx, (%rdi) 2120 ; AVX512-NEXT: movw %cx, 14(%rdi) 2121 ; AVX512-NEXT: movw %ax, 12(%rdi) 2122 ; AVX512-NEXT: movw %r11w, 10(%rdi) 2123 ; AVX512-NEXT: movw %r10w, 6(%rdi) 2124 ; AVX512-NEXT: movw %r9w, 4(%rdi) 2125 ; AVX512-NEXT: movw %r8w, 2(%rdi) 2126 ; AVX512-NEXT: retq 2127 %1 = fptrunc <8 x float> %a0 to <8 x half> 2128 %2 = bitcast <8 x half> %1 to <8 x i16> 2129 store <8 x i16> %2, <8 x i16>* %a1 2130 ret void 2131 } 2132 2133 define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) { 2134 ; AVX1-LABEL: store_cvt_16f32_to_16i16: 2135 ; AVX1: # BB#0: 2136 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2137 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2138 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2139 ; AVX1-NEXT: vmovd %xmm4, %eax 2140 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2141 ; AVX1-NEXT: movw %ax, 24(%rdi) 2142 ; AVX1-NEXT: vmovd %xmm4, %eax 2143 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2144 ; AVX1-NEXT: movw %ax, 16(%rdi) 2145 ; AVX1-NEXT: vmovd %xmm4, %eax 2146 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2147 ; AVX1-NEXT: movw %ax, 8(%rdi) 2148 ; AVX1-NEXT: vmovd %xmm4, %eax 2149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2150 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2151 ; AVX1-NEXT: movw %ax, (%rdi) 2152 ; AVX1-NEXT: vmovd %xmm4, %eax 2153 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2154 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2155 ; AVX1-NEXT: movw %ax, 30(%rdi) 2156 ; AVX1-NEXT: vmovd %xmm4, %eax 2157 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2158 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2159 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2160 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2161 ; AVX1-NEXT: movw %ax, 28(%rdi) 2162 ; AVX1-NEXT: vmovd %xmm3, %eax 2163 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2164 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2165 ; AVX1-NEXT: movw %ax, 26(%rdi) 2166 ; AVX1-NEXT: vmovd %xmm3, %eax 2167 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2168 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2169 ; AVX1-NEXT: movw %ax, 22(%rdi) 2170 ; AVX1-NEXT: vmovd %xmm3, %eax 2171 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2172 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2173 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2174 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2175 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2176 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2177 ; AVX1-NEXT: movw %ax, 20(%rdi) 2178 ; AVX1-NEXT: vmovd %xmm1, %eax 2179 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2180 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2181 ; AVX1-NEXT: movw %ax, 18(%rdi) 2182 ; AVX1-NEXT: vmovd %xmm1, %eax 2183 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2184 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2185 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2186 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2187 ; AVX1-NEXT: movw %ax, 14(%rdi) 2188 ; AVX1-NEXT: vmovd %xmm2, %eax 2189 ; AVX1-NEXT: movw %ax, 12(%rdi) 2190 ; AVX1-NEXT: vmovd %xmm1, %eax 2191 ; AVX1-NEXT: movw %ax, 10(%rdi) 2192 ; AVX1-NEXT: vmovd %xmm0, %eax 2193 ; AVX1-NEXT: movw %ax, 6(%rdi) 2194 ; AVX1-NEXT: vmovd %xmm3, %eax 2195 ; AVX1-NEXT: movw %ax, 4(%rdi) 2196 ; AVX1-NEXT: vmovd %xmm4, %eax 2197 ; AVX1-NEXT: movw %ax, 2(%rdi) 2198 ; AVX1-NEXT: vzeroupper 2199 ; AVX1-NEXT: retq 2200 ; 2201 ; AVX2-LABEL: store_cvt_16f32_to_16i16: 2202 ; AVX2: # BB#0: 2203 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 2204 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 2205 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2206 ; AVX2-NEXT: vmovd %xmm4, %eax 2207 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2208 ; AVX2-NEXT: movw %ax, 24(%rdi) 2209 ; AVX2-NEXT: vmovd %xmm4, %eax 2210 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2211 ; AVX2-NEXT: movw %ax, 16(%rdi) 2212 ; AVX2-NEXT: vmovd %xmm4, %eax 2213 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2214 ; AVX2-NEXT: movw %ax, 8(%rdi) 2215 ; AVX2-NEXT: vmovd %xmm4, %eax 2216 ; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2217 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2218 ; AVX2-NEXT: movw %ax, (%rdi) 2219 ; AVX2-NEXT: vmovd %xmm4, %eax 2220 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2221 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2222 ; AVX2-NEXT: movw %ax, 30(%rdi) 2223 ; AVX2-NEXT: vmovd %xmm4, %eax 2224 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2225 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2226 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2227 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2228 ; AVX2-NEXT: movw %ax, 28(%rdi) 2229 ; AVX2-NEXT: vmovd %xmm3, %eax 2230 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2231 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2232 ; AVX2-NEXT: movw %ax, 26(%rdi) 2233 ; AVX2-NEXT: vmovd %xmm3, %eax 2234 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2235 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2236 ; AVX2-NEXT: movw %ax, 22(%rdi) 2237 ; AVX2-NEXT: vmovd %xmm3, %eax 2238 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2239 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2240 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2241 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2242 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2243 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2244 ; AVX2-NEXT: movw %ax, 20(%rdi) 2245 ; AVX2-NEXT: vmovd %xmm1, %eax 2246 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2247 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2248 ; AVX2-NEXT: movw %ax, 18(%rdi) 2249 ; AVX2-NEXT: vmovd %xmm1, %eax 2250 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2251 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2252 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2253 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2254 ; AVX2-NEXT: movw %ax, 14(%rdi) 2255 ; AVX2-NEXT: vmovd %xmm2, %eax 2256 ; AVX2-NEXT: movw %ax, 12(%rdi) 2257 ; AVX2-NEXT: vmovd %xmm1, %eax 2258 ; AVX2-NEXT: movw %ax, 10(%rdi) 2259 ; AVX2-NEXT: vmovd %xmm0, %eax 2260 ; AVX2-NEXT: movw %ax, 6(%rdi) 2261 ; AVX2-NEXT: vmovd %xmm3, %eax 2262 ; AVX2-NEXT: movw %ax, 4(%rdi) 2263 ; AVX2-NEXT: vmovd %xmm4, %eax 2264 ; AVX2-NEXT: movw %ax, 2(%rdi) 2265 ; AVX2-NEXT: vzeroupper 2266 ; AVX2-NEXT: retq 2267 ; 2268 ; AVX512-LABEL: store_cvt_16f32_to_16i16: 2269 ; AVX512: # BB#0: 2270 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2271 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 2272 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 2273 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2274 ; AVX512-NEXT: vmovd %xmm4, %eax 2275 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2276 ; AVX512-NEXT: movw %ax, 24(%rdi) 2277 ; AVX512-NEXT: vmovd %xmm4, %eax 2278 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2279 ; AVX512-NEXT: movw %ax, 16(%rdi) 2280 ; AVX512-NEXT: vmovd %xmm4, %eax 2281 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2282 ; AVX512-NEXT: movw %ax, 8(%rdi) 2283 ; AVX512-NEXT: vmovd %xmm4, %eax 2284 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2285 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2286 ; AVX512-NEXT: movw %ax, (%rdi) 2287 ; AVX512-NEXT: vmovd %xmm4, %eax 2288 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2289 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2290 ; AVX512-NEXT: movw %ax, 30(%rdi) 2291 ; AVX512-NEXT: vmovd %xmm4, %eax 2292 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2293 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2294 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2295 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2296 ; AVX512-NEXT: movw %ax, 28(%rdi) 2297 ; AVX512-NEXT: vmovd %xmm3, %eax 2298 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] 2299 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2300 ; AVX512-NEXT: movw %ax, 26(%rdi) 2301 ; AVX512-NEXT: vmovd %xmm3, %eax 2302 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 2303 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2304 ; AVX512-NEXT: movw %ax, 22(%rdi) 2305 ; AVX512-NEXT: vmovd %xmm3, %eax 2306 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2307 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2308 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2309 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2310 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] 2311 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2312 ; AVX512-NEXT: movw %ax, 20(%rdi) 2313 ; AVX512-NEXT: vmovd %xmm2, %eax 2314 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2315 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2316 ; AVX512-NEXT: movw %ax, 18(%rdi) 2317 ; AVX512-NEXT: vmovd %xmm2, %eax 2318 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2319 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2320 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2321 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2322 ; AVX512-NEXT: movw %ax, 14(%rdi) 2323 ; AVX512-NEXT: vmovd %xmm1, %eax 2324 ; AVX512-NEXT: movw %ax, 12(%rdi) 2325 ; AVX512-NEXT: vmovd %xmm2, %eax 2326 ; AVX512-NEXT: movw %ax, 10(%rdi) 2327 ; AVX512-NEXT: vmovd %xmm0, %eax 2328 ; AVX512-NEXT: movw %ax, 6(%rdi) 2329 ; AVX512-NEXT: vmovd %xmm3, %eax 2330 ; AVX512-NEXT: movw %ax, 4(%rdi) 2331 ; AVX512-NEXT: vmovd %xmm4, %eax 2332 ; AVX512-NEXT: movw %ax, 2(%rdi) 2333 ; AVX512-NEXT: retq 2334 %1 = fptrunc <16 x float> %a0 to <16 x half> 2335 %2 = bitcast <16 x half> %1 to <16 x i16> 2336 store <16 x i16> %2, <16 x i16>* %a1 2337 ret void 2338 } 2339 2340 ; 2341 ; Double to Half 2342 ; 2343 2344 define i16 @cvt_f64_to_i16(double %a0) { 2345 ; ALL-LABEL: cvt_f64_to_i16: 2346 ; ALL: # BB#0: 2347 ; ALL-NEXT: jmp __truncdfhf2 # TAILCALL 2348 %1 = fptrunc double %a0 to half 2349 %2 = bitcast half %1 to i16 2350 ret i16 %2 2351 } 2352 2353 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) { 2354 ; ALL-LABEL: cvt_2f64_to_2i16: 2355 ; ALL: # BB#0: 2356 ; ALL-NEXT: pushq %rbx 2357 ; ALL-NEXT: .Ltmp0: 2358 ; ALL-NEXT: .cfi_def_cfa_offset 16 2359 ; ALL-NEXT: subq $16, %rsp 2360 ; ALL-NEXT: .Ltmp1: 2361 ; ALL-NEXT: .cfi_def_cfa_offset 32 2362 ; ALL-NEXT: .Ltmp2: 2363 ; ALL-NEXT: .cfi_offset %rbx, -16 2364 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2365 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2366 ; ALL-NEXT: callq __truncdfhf2 2367 ; ALL-NEXT: movw %ax, %bx 2368 ; ALL-NEXT: shll $16, %ebx 2369 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2370 ; ALL-NEXT: callq __truncdfhf2 2371 ; ALL-NEXT: movzwl %ax, %eax 2372 ; ALL-NEXT: orl %ebx, %eax 2373 ; ALL-NEXT: vmovd %eax, %xmm0 2374 ; ALL-NEXT: addq $16, %rsp 2375 ; ALL-NEXT: popq %rbx 2376 ; ALL-NEXT: retq 2377 %1 = fptrunc <2 x double> %a0 to <2 x half> 2378 %2 = bitcast <2 x half> %1 to <2 x i16> 2379 ret <2 x i16> %2 2380 } 2381 2382 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) { 2383 ; AVX1-LABEL: cvt_4f64_to_4i16: 2384 ; AVX1: # BB#0: 2385 ; AVX1-NEXT: pushq %r14 2386 ; AVX1-NEXT: .Ltmp3: 2387 ; AVX1-NEXT: .cfi_def_cfa_offset 16 2388 ; AVX1-NEXT: pushq %rbx 2389 ; AVX1-NEXT: .Ltmp4: 2390 ; AVX1-NEXT: .cfi_def_cfa_offset 24 2391 ; AVX1-NEXT: subq $40, %rsp 2392 ; AVX1-NEXT: .Ltmp5: 2393 ; AVX1-NEXT: .cfi_def_cfa_offset 64 2394 ; AVX1-NEXT: .Ltmp6: 2395 ; AVX1-NEXT: .cfi_offset %rbx, -24 2396 ; AVX1-NEXT: .Ltmp7: 2397 ; AVX1-NEXT: .cfi_offset %r14, -16 2398 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2399 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2400 ; AVX1-NEXT: vzeroupper 2401 ; AVX1-NEXT: callq __truncdfhf2 2402 ; AVX1-NEXT: movw %ax, %bx 2403 ; AVX1-NEXT: shll $16, %ebx 2404 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2405 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2406 ; AVX1-NEXT: vzeroupper 2407 ; AVX1-NEXT: callq __truncdfhf2 2408 ; AVX1-NEXT: movzwl %ax, %r14d 2409 ; AVX1-NEXT: orl %ebx, %r14d 2410 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2411 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2412 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2413 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2414 ; AVX1-NEXT: vzeroupper 2415 ; AVX1-NEXT: callq __truncdfhf2 2416 ; AVX1-NEXT: movw %ax, %bx 2417 ; AVX1-NEXT: shll $16, %ebx 2418 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2419 ; AVX1-NEXT: callq __truncdfhf2 2420 ; AVX1-NEXT: movzwl %ax, %eax 2421 ; AVX1-NEXT: orl %ebx, %eax 2422 ; AVX1-NEXT: shlq $32, %rax 2423 ; AVX1-NEXT: orq %r14, %rax 2424 ; AVX1-NEXT: vmovq %rax, %xmm0 2425 ; AVX1-NEXT: addq $40, %rsp 2426 ; AVX1-NEXT: popq %rbx 2427 ; AVX1-NEXT: popq %r14 2428 ; AVX1-NEXT: retq 2429 ; 2430 ; AVX2-LABEL: cvt_4f64_to_4i16: 2431 ; AVX2: # BB#0: 2432 ; AVX2-NEXT: pushq %r14 2433 ; AVX2-NEXT: .Ltmp3: 2434 ; AVX2-NEXT: .cfi_def_cfa_offset 16 2435 ; AVX2-NEXT: pushq %rbx 2436 ; AVX2-NEXT: .Ltmp4: 2437 ; AVX2-NEXT: .cfi_def_cfa_offset 24 2438 ; AVX2-NEXT: subq $40, %rsp 2439 ; AVX2-NEXT: .Ltmp5: 2440 ; AVX2-NEXT: .cfi_def_cfa_offset 64 2441 ; AVX2-NEXT: .Ltmp6: 2442 ; AVX2-NEXT: .cfi_offset %rbx, -24 2443 ; AVX2-NEXT: .Ltmp7: 2444 ; AVX2-NEXT: .cfi_offset %r14, -16 2445 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2446 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2447 ; AVX2-NEXT: vzeroupper 2448 ; AVX2-NEXT: callq __truncdfhf2 2449 ; AVX2-NEXT: movw %ax, %bx 2450 ; AVX2-NEXT: shll $16, %ebx 2451 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2452 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2453 ; AVX2-NEXT: vzeroupper 2454 ; AVX2-NEXT: callq __truncdfhf2 2455 ; AVX2-NEXT: movzwl %ax, %r14d 2456 ; AVX2-NEXT: orl %ebx, %r14d 2457 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2458 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2459 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2460 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2461 ; AVX2-NEXT: vzeroupper 2462 ; AVX2-NEXT: callq __truncdfhf2 2463 ; AVX2-NEXT: movw %ax, %bx 2464 ; AVX2-NEXT: shll $16, %ebx 2465 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2466 ; AVX2-NEXT: callq __truncdfhf2 2467 ; AVX2-NEXT: movzwl %ax, %eax 2468 ; AVX2-NEXT: orl %ebx, %eax 2469 ; AVX2-NEXT: shlq $32, %rax 2470 ; AVX2-NEXT: orq %r14, %rax 2471 ; AVX2-NEXT: vmovq %rax, %xmm0 2472 ; AVX2-NEXT: addq $40, %rsp 2473 ; AVX2-NEXT: popq %rbx 2474 ; AVX2-NEXT: popq %r14 2475 ; AVX2-NEXT: retq 2476 ; 2477 ; AVX512-LABEL: cvt_4f64_to_4i16: 2478 ; AVX512: # BB#0: 2479 ; AVX512-NEXT: pushq %r14 2480 ; AVX512-NEXT: .Ltmp3: 2481 ; AVX512-NEXT: .cfi_def_cfa_offset 16 2482 ; AVX512-NEXT: pushq %rbx 2483 ; AVX512-NEXT: .Ltmp4: 2484 ; AVX512-NEXT: .cfi_def_cfa_offset 24 2485 ; AVX512-NEXT: subq $40, %rsp 2486 ; AVX512-NEXT: .Ltmp5: 2487 ; AVX512-NEXT: .cfi_def_cfa_offset 64 2488 ; AVX512-NEXT: .Ltmp6: 2489 ; AVX512-NEXT: .cfi_offset %rbx, -24 2490 ; AVX512-NEXT: .Ltmp7: 2491 ; AVX512-NEXT: .cfi_offset %r14, -16 2492 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2493 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2494 ; AVX512-NEXT: callq __truncdfhf2 2495 ; AVX512-NEXT: movw %ax, %bx 2496 ; AVX512-NEXT: shll $16, %ebx 2497 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2498 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2499 ; AVX512-NEXT: callq __truncdfhf2 2500 ; AVX512-NEXT: movzwl %ax, %r14d 2501 ; AVX512-NEXT: orl %ebx, %r14d 2502 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2503 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 2504 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2505 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2506 ; AVX512-NEXT: callq __truncdfhf2 2507 ; AVX512-NEXT: movw %ax, %bx 2508 ; AVX512-NEXT: shll $16, %ebx 2509 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2510 ; AVX512-NEXT: callq __truncdfhf2 2511 ; AVX512-NEXT: movzwl %ax, %eax 2512 ; AVX512-NEXT: orl %ebx, %eax 2513 ; AVX512-NEXT: shlq $32, %rax 2514 ; AVX512-NEXT: orq %r14, %rax 2515 ; AVX512-NEXT: vmovq %rax, %xmm0 2516 ; AVX512-NEXT: addq $40, %rsp 2517 ; AVX512-NEXT: popq %rbx 2518 ; AVX512-NEXT: popq %r14 2519 ; AVX512-NEXT: retq 2520 %1 = fptrunc <4 x double> %a0 to <4 x half> 2521 %2 = bitcast <4 x half> %1 to <4 x i16> 2522 ret <4 x i16> %2 2523 } 2524 2525 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) { 2526 ; AVX1-LABEL: cvt_4f64_to_8i16_undef: 2527 ; AVX1: # BB#0: 2528 ; AVX1-NEXT: pushq %r14 2529 ; AVX1-NEXT: .Ltmp8: 2530 ; AVX1-NEXT: .cfi_def_cfa_offset 16 2531 ; AVX1-NEXT: pushq %rbx 2532 ; AVX1-NEXT: .Ltmp9: 2533 ; AVX1-NEXT: .cfi_def_cfa_offset 24 2534 ; AVX1-NEXT: subq $40, %rsp 2535 ; AVX1-NEXT: .Ltmp10: 2536 ; AVX1-NEXT: .cfi_def_cfa_offset 64 2537 ; AVX1-NEXT: .Ltmp11: 2538 ; AVX1-NEXT: .cfi_offset %rbx, -24 2539 ; AVX1-NEXT: .Ltmp12: 2540 ; AVX1-NEXT: .cfi_offset %r14, -16 2541 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2542 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2543 ; AVX1-NEXT: vzeroupper 2544 ; AVX1-NEXT: callq __truncdfhf2 2545 ; AVX1-NEXT: movw %ax, %bx 2546 ; AVX1-NEXT: shll $16, %ebx 2547 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2548 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2549 ; AVX1-NEXT: vzeroupper 2550 ; AVX1-NEXT: callq __truncdfhf2 2551 ; AVX1-NEXT: movzwl %ax, %r14d 2552 ; AVX1-NEXT: orl %ebx, %r14d 2553 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2554 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2555 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2556 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2557 ; AVX1-NEXT: vzeroupper 2558 ; AVX1-NEXT: callq __truncdfhf2 2559 ; AVX1-NEXT: movw %ax, %bx 2560 ; AVX1-NEXT: shll $16, %ebx 2561 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2562 ; AVX1-NEXT: callq __truncdfhf2 2563 ; AVX1-NEXT: movzwl %ax, %eax 2564 ; AVX1-NEXT: orl %ebx, %eax 2565 ; AVX1-NEXT: shlq $32, %rax 2566 ; AVX1-NEXT: orq %r14, %rax 2567 ; AVX1-NEXT: vmovq %rax, %xmm0 2568 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2569 ; AVX1-NEXT: addq $40, %rsp 2570 ; AVX1-NEXT: popq %rbx 2571 ; AVX1-NEXT: popq %r14 2572 ; AVX1-NEXT: retq 2573 ; 2574 ; AVX2-LABEL: cvt_4f64_to_8i16_undef: 2575 ; AVX2: # BB#0: 2576 ; AVX2-NEXT: pushq %r14 2577 ; AVX2-NEXT: .Ltmp8: 2578 ; AVX2-NEXT: .cfi_def_cfa_offset 16 2579 ; AVX2-NEXT: pushq %rbx 2580 ; AVX2-NEXT: .Ltmp9: 2581 ; AVX2-NEXT: .cfi_def_cfa_offset 24 2582 ; AVX2-NEXT: subq $40, %rsp 2583 ; AVX2-NEXT: .Ltmp10: 2584 ; AVX2-NEXT: .cfi_def_cfa_offset 64 2585 ; AVX2-NEXT: .Ltmp11: 2586 ; AVX2-NEXT: .cfi_offset %rbx, -24 2587 ; AVX2-NEXT: .Ltmp12: 2588 ; AVX2-NEXT: .cfi_offset %r14, -16 2589 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2590 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2591 ; AVX2-NEXT: vzeroupper 2592 ; AVX2-NEXT: callq __truncdfhf2 2593 ; AVX2-NEXT: movw %ax, %bx 2594 ; AVX2-NEXT: shll $16, %ebx 2595 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2596 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2597 ; AVX2-NEXT: vzeroupper 2598 ; AVX2-NEXT: callq __truncdfhf2 2599 ; AVX2-NEXT: movzwl %ax, %r14d 2600 ; AVX2-NEXT: orl %ebx, %r14d 2601 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2602 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2603 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2604 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2605 ; AVX2-NEXT: vzeroupper 2606 ; AVX2-NEXT: callq __truncdfhf2 2607 ; AVX2-NEXT: movw %ax, %bx 2608 ; AVX2-NEXT: shll $16, %ebx 2609 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2610 ; AVX2-NEXT: callq __truncdfhf2 2611 ; AVX2-NEXT: movzwl %ax, %eax 2612 ; AVX2-NEXT: orl %ebx, %eax 2613 ; AVX2-NEXT: shlq $32, %rax 2614 ; AVX2-NEXT: orq %r14, %rax 2615 ; AVX2-NEXT: vmovq %rax, %xmm0 2616 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2617 ; AVX2-NEXT: addq $40, %rsp 2618 ; AVX2-NEXT: popq %rbx 2619 ; AVX2-NEXT: popq %r14 2620 ; AVX2-NEXT: retq 2621 ; 2622 ; AVX512-LABEL: cvt_4f64_to_8i16_undef: 2623 ; AVX512: # BB#0: 2624 ; AVX512-NEXT: pushq %r14 2625 ; AVX512-NEXT: .Ltmp8: 2626 ; AVX512-NEXT: .cfi_def_cfa_offset 16 2627 ; AVX512-NEXT: pushq %rbx 2628 ; AVX512-NEXT: .Ltmp9: 2629 ; AVX512-NEXT: .cfi_def_cfa_offset 24 2630 ; AVX512-NEXT: subq $40, %rsp 2631 ; AVX512-NEXT: .Ltmp10: 2632 ; AVX512-NEXT: .cfi_def_cfa_offset 64 2633 ; AVX512-NEXT: .Ltmp11: 2634 ; AVX512-NEXT: .cfi_offset %rbx, -24 2635 ; AVX512-NEXT: .Ltmp12: 2636 ; AVX512-NEXT: .cfi_offset %r14, -16 2637 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2638 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2639 ; AVX512-NEXT: callq __truncdfhf2 2640 ; AVX512-NEXT: movw %ax, %bx 2641 ; AVX512-NEXT: shll $16, %ebx 2642 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2643 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2644 ; AVX512-NEXT: callq __truncdfhf2 2645 ; AVX512-NEXT: movzwl %ax, %r14d 2646 ; AVX512-NEXT: orl %ebx, %r14d 2647 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2648 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 2649 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2650 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2651 ; AVX512-NEXT: callq __truncdfhf2 2652 ; AVX512-NEXT: movw %ax, %bx 2653 ; AVX512-NEXT: shll $16, %ebx 2654 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2655 ; AVX512-NEXT: callq __truncdfhf2 2656 ; AVX512-NEXT: movzwl %ax, %eax 2657 ; AVX512-NEXT: orl %ebx, %eax 2658 ; AVX512-NEXT: shlq $32, %rax 2659 ; AVX512-NEXT: orq %r14, %rax 2660 ; AVX512-NEXT: vmovq %rax, %xmm0 2661 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2662 ; AVX512-NEXT: addq $40, %rsp 2663 ; AVX512-NEXT: popq %rbx 2664 ; AVX512-NEXT: popq %r14 2665 ; AVX512-NEXT: retq 2666 %1 = fptrunc <4 x double> %a0 to <4 x half> 2667 %2 = bitcast <4 x half> %1 to <4 x i16> 2668 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2669 ret <8 x i16> %3 2670 } 2671 2672 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) { 2673 ; AVX1-LABEL: cvt_4f64_to_8i16_zero: 2674 ; AVX1: # BB#0: 2675 ; AVX1-NEXT: pushq %r14 2676 ; AVX1-NEXT: .Ltmp13: 2677 ; AVX1-NEXT: .cfi_def_cfa_offset 16 2678 ; AVX1-NEXT: pushq %rbx 2679 ; AVX1-NEXT: .Ltmp14: 2680 ; AVX1-NEXT: .cfi_def_cfa_offset 24 2681 ; AVX1-NEXT: subq $40, %rsp 2682 ; AVX1-NEXT: .Ltmp15: 2683 ; AVX1-NEXT: .cfi_def_cfa_offset 64 2684 ; AVX1-NEXT: .Ltmp16: 2685 ; AVX1-NEXT: .cfi_offset %rbx, -24 2686 ; AVX1-NEXT: .Ltmp17: 2687 ; AVX1-NEXT: .cfi_offset %r14, -16 2688 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2689 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2690 ; AVX1-NEXT: vzeroupper 2691 ; AVX1-NEXT: callq __truncdfhf2 2692 ; AVX1-NEXT: movw %ax, %bx 2693 ; AVX1-NEXT: shll $16, %ebx 2694 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2695 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2696 ; AVX1-NEXT: vzeroupper 2697 ; AVX1-NEXT: callq __truncdfhf2 2698 ; AVX1-NEXT: movzwl %ax, %r14d 2699 ; AVX1-NEXT: orl %ebx, %r14d 2700 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2701 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2702 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2703 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2704 ; AVX1-NEXT: vzeroupper 2705 ; AVX1-NEXT: callq __truncdfhf2 2706 ; AVX1-NEXT: movw %ax, %bx 2707 ; AVX1-NEXT: shll $16, %ebx 2708 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2709 ; AVX1-NEXT: callq __truncdfhf2 2710 ; AVX1-NEXT: movzwl %ax, %eax 2711 ; AVX1-NEXT: orl %ebx, %eax 2712 ; AVX1-NEXT: shlq $32, %rax 2713 ; AVX1-NEXT: orq %r14, %rax 2714 ; AVX1-NEXT: vmovq %rax, %xmm0 2715 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2716 ; AVX1-NEXT: addq $40, %rsp 2717 ; AVX1-NEXT: popq %rbx 2718 ; AVX1-NEXT: popq %r14 2719 ; AVX1-NEXT: retq 2720 ; 2721 ; AVX2-LABEL: cvt_4f64_to_8i16_zero: 2722 ; AVX2: # BB#0: 2723 ; AVX2-NEXT: pushq %r14 2724 ; AVX2-NEXT: .Ltmp13: 2725 ; AVX2-NEXT: .cfi_def_cfa_offset 16 2726 ; AVX2-NEXT: pushq %rbx 2727 ; AVX2-NEXT: .Ltmp14: 2728 ; AVX2-NEXT: .cfi_def_cfa_offset 24 2729 ; AVX2-NEXT: subq $40, %rsp 2730 ; AVX2-NEXT: .Ltmp15: 2731 ; AVX2-NEXT: .cfi_def_cfa_offset 64 2732 ; AVX2-NEXT: .Ltmp16: 2733 ; AVX2-NEXT: .cfi_offset %rbx, -24 2734 ; AVX2-NEXT: .Ltmp17: 2735 ; AVX2-NEXT: .cfi_offset %r14, -16 2736 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2737 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2738 ; AVX2-NEXT: vzeroupper 2739 ; AVX2-NEXT: callq __truncdfhf2 2740 ; AVX2-NEXT: movw %ax, %bx 2741 ; AVX2-NEXT: shll $16, %ebx 2742 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2743 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2744 ; AVX2-NEXT: vzeroupper 2745 ; AVX2-NEXT: callq __truncdfhf2 2746 ; AVX2-NEXT: movzwl %ax, %r14d 2747 ; AVX2-NEXT: orl %ebx, %r14d 2748 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2749 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2750 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2751 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2752 ; AVX2-NEXT: vzeroupper 2753 ; AVX2-NEXT: callq __truncdfhf2 2754 ; AVX2-NEXT: movw %ax, %bx 2755 ; AVX2-NEXT: shll $16, %ebx 2756 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2757 ; AVX2-NEXT: callq __truncdfhf2 2758 ; AVX2-NEXT: movzwl %ax, %eax 2759 ; AVX2-NEXT: orl %ebx, %eax 2760 ; AVX2-NEXT: shlq $32, %rax 2761 ; AVX2-NEXT: orq %r14, %rax 2762 ; AVX2-NEXT: vmovq %rax, %xmm0 2763 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2764 ; AVX2-NEXT: addq $40, %rsp 2765 ; AVX2-NEXT: popq %rbx 2766 ; AVX2-NEXT: popq %r14 2767 ; AVX2-NEXT: retq 2768 ; 2769 ; AVX512-LABEL: cvt_4f64_to_8i16_zero: 2770 ; AVX512: # BB#0: 2771 ; AVX512-NEXT: pushq %r14 2772 ; AVX512-NEXT: .Ltmp13: 2773 ; AVX512-NEXT: .cfi_def_cfa_offset 16 2774 ; AVX512-NEXT: pushq %rbx 2775 ; AVX512-NEXT: .Ltmp14: 2776 ; AVX512-NEXT: .cfi_def_cfa_offset 24 2777 ; AVX512-NEXT: subq $40, %rsp 2778 ; AVX512-NEXT: .Ltmp15: 2779 ; AVX512-NEXT: .cfi_def_cfa_offset 64 2780 ; AVX512-NEXT: .Ltmp16: 2781 ; AVX512-NEXT: .cfi_offset %rbx, -24 2782 ; AVX512-NEXT: .Ltmp17: 2783 ; AVX512-NEXT: .cfi_offset %r14, -16 2784 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2785 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2786 ; AVX512-NEXT: callq __truncdfhf2 2787 ; AVX512-NEXT: movw %ax, %bx 2788 ; AVX512-NEXT: shll $16, %ebx 2789 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2790 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2791 ; AVX512-NEXT: callq __truncdfhf2 2792 ; AVX512-NEXT: movzwl %ax, %r14d 2793 ; AVX512-NEXT: orl %ebx, %r14d 2794 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2795 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 2796 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2797 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2798 ; AVX512-NEXT: callq __truncdfhf2 2799 ; AVX512-NEXT: movw %ax, %bx 2800 ; AVX512-NEXT: shll $16, %ebx 2801 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2802 ; AVX512-NEXT: callq __truncdfhf2 2803 ; AVX512-NEXT: movzwl %ax, %eax 2804 ; AVX512-NEXT: orl %ebx, %eax 2805 ; AVX512-NEXT: shlq $32, %rax 2806 ; AVX512-NEXT: orq %r14, %rax 2807 ; AVX512-NEXT: vmovq %rax, %xmm0 2808 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2809 ; AVX512-NEXT: addq $40, %rsp 2810 ; AVX512-NEXT: popq %rbx 2811 ; AVX512-NEXT: popq %r14 2812 ; AVX512-NEXT: retq 2813 %1 = fptrunc <4 x double> %a0 to <4 x half> 2814 %2 = bitcast <4 x half> %1 to <4 x i16> 2815 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2816 ret <8 x i16> %3 2817 } 2818 2819 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { 2820 ; AVX1-LABEL: cvt_8f64_to_8i16: 2821 ; AVX1: # BB#0: 2822 ; AVX1-NEXT: pushq %r15 2823 ; AVX1-NEXT: .Ltmp18: 2824 ; AVX1-NEXT: .cfi_def_cfa_offset 16 2825 ; AVX1-NEXT: pushq %r14 2826 ; AVX1-NEXT: .Ltmp19: 2827 ; AVX1-NEXT: .cfi_def_cfa_offset 24 2828 ; AVX1-NEXT: pushq %rbx 2829 ; AVX1-NEXT: .Ltmp20: 2830 ; AVX1-NEXT: .cfi_def_cfa_offset 32 2831 ; AVX1-NEXT: subq $64, %rsp 2832 ; AVX1-NEXT: .Ltmp21: 2833 ; AVX1-NEXT: .cfi_def_cfa_offset 96 2834 ; AVX1-NEXT: .Ltmp22: 2835 ; AVX1-NEXT: .cfi_offset %rbx, -32 2836 ; AVX1-NEXT: .Ltmp23: 2837 ; AVX1-NEXT: .cfi_offset %r14, -24 2838 ; AVX1-NEXT: .Ltmp24: 2839 ; AVX1-NEXT: .cfi_offset %r15, -16 2840 ; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 2841 ; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 2842 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2843 ; AVX1-NEXT: vzeroupper 2844 ; AVX1-NEXT: callq __truncdfhf2 2845 ; AVX1-NEXT: movw %ax, %bx 2846 ; AVX1-NEXT: shll $16, %ebx 2847 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2848 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2849 ; AVX1-NEXT: vzeroupper 2850 ; AVX1-NEXT: callq __truncdfhf2 2851 ; AVX1-NEXT: movzwl %ax, %r15d 2852 ; AVX1-NEXT: orl %ebx, %r15d 2853 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2854 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2855 ; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 2856 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2857 ; AVX1-NEXT: vzeroupper 2858 ; AVX1-NEXT: callq __truncdfhf2 2859 ; AVX1-NEXT: movw %ax, %bx 2860 ; AVX1-NEXT: shll $16, %ebx 2861 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 2862 ; AVX1-NEXT: callq __truncdfhf2 2863 ; AVX1-NEXT: movzwl %ax, %r14d 2864 ; AVX1-NEXT: orl %ebx, %r14d 2865 ; AVX1-NEXT: shlq $32, %r14 2866 ; AVX1-NEXT: orq %r15, %r14 2867 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2868 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2869 ; AVX1-NEXT: vzeroupper 2870 ; AVX1-NEXT: callq __truncdfhf2 2871 ; AVX1-NEXT: movw %ax, %bx 2872 ; AVX1-NEXT: shll $16, %ebx 2873 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2874 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2875 ; AVX1-NEXT: vzeroupper 2876 ; AVX1-NEXT: callq __truncdfhf2 2877 ; AVX1-NEXT: movzwl %ax, %r15d 2878 ; AVX1-NEXT: orl %ebx, %r15d 2879 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2880 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2881 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2882 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2883 ; AVX1-NEXT: vzeroupper 2884 ; AVX1-NEXT: callq __truncdfhf2 2885 ; AVX1-NEXT: movw %ax, %bx 2886 ; AVX1-NEXT: shll $16, %ebx 2887 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2888 ; AVX1-NEXT: callq __truncdfhf2 2889 ; AVX1-NEXT: movzwl %ax, %eax 2890 ; AVX1-NEXT: orl %ebx, %eax 2891 ; AVX1-NEXT: shlq $32, %rax 2892 ; AVX1-NEXT: orq %r15, %rax 2893 ; AVX1-NEXT: vmovq %rax, %xmm0 2894 ; AVX1-NEXT: vmovq %r14, %xmm1 2895 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2896 ; AVX1-NEXT: addq $64, %rsp 2897 ; AVX1-NEXT: popq %rbx 2898 ; AVX1-NEXT: popq %r14 2899 ; AVX1-NEXT: popq %r15 2900 ; AVX1-NEXT: retq 2901 ; 2902 ; AVX2-LABEL: cvt_8f64_to_8i16: 2903 ; AVX2: # BB#0: 2904 ; AVX2-NEXT: pushq %r15 2905 ; AVX2-NEXT: .Ltmp18: 2906 ; AVX2-NEXT: .cfi_def_cfa_offset 16 2907 ; AVX2-NEXT: pushq %r14 2908 ; AVX2-NEXT: .Ltmp19: 2909 ; AVX2-NEXT: .cfi_def_cfa_offset 24 2910 ; AVX2-NEXT: pushq %rbx 2911 ; AVX2-NEXT: .Ltmp20: 2912 ; AVX2-NEXT: .cfi_def_cfa_offset 32 2913 ; AVX2-NEXT: subq $64, %rsp 2914 ; AVX2-NEXT: .Ltmp21: 2915 ; AVX2-NEXT: .cfi_def_cfa_offset 96 2916 ; AVX2-NEXT: .Ltmp22: 2917 ; AVX2-NEXT: .cfi_offset %rbx, -32 2918 ; AVX2-NEXT: .Ltmp23: 2919 ; AVX2-NEXT: .cfi_offset %r14, -24 2920 ; AVX2-NEXT: .Ltmp24: 2921 ; AVX2-NEXT: .cfi_offset %r15, -16 2922 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 2923 ; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 2924 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2925 ; AVX2-NEXT: vzeroupper 2926 ; AVX2-NEXT: callq __truncdfhf2 2927 ; AVX2-NEXT: movw %ax, %bx 2928 ; AVX2-NEXT: shll $16, %ebx 2929 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2930 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2931 ; AVX2-NEXT: vzeroupper 2932 ; AVX2-NEXT: callq __truncdfhf2 2933 ; AVX2-NEXT: movzwl %ax, %r15d 2934 ; AVX2-NEXT: orl %ebx, %r15d 2935 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2936 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2937 ; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 2938 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2939 ; AVX2-NEXT: vzeroupper 2940 ; AVX2-NEXT: callq __truncdfhf2 2941 ; AVX2-NEXT: movw %ax, %bx 2942 ; AVX2-NEXT: shll $16, %ebx 2943 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 2944 ; AVX2-NEXT: callq __truncdfhf2 2945 ; AVX2-NEXT: movzwl %ax, %r14d 2946 ; AVX2-NEXT: orl %ebx, %r14d 2947 ; AVX2-NEXT: shlq $32, %r14 2948 ; AVX2-NEXT: orq %r15, %r14 2949 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2950 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2951 ; AVX2-NEXT: vzeroupper 2952 ; AVX2-NEXT: callq __truncdfhf2 2953 ; AVX2-NEXT: movw %ax, %bx 2954 ; AVX2-NEXT: shll $16, %ebx 2955 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2956 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2957 ; AVX2-NEXT: vzeroupper 2958 ; AVX2-NEXT: callq __truncdfhf2 2959 ; AVX2-NEXT: movzwl %ax, %r15d 2960 ; AVX2-NEXT: orl %ebx, %r15d 2961 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2962 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2963 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2964 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2965 ; AVX2-NEXT: vzeroupper 2966 ; AVX2-NEXT: callq __truncdfhf2 2967 ; AVX2-NEXT: movw %ax, %bx 2968 ; AVX2-NEXT: shll $16, %ebx 2969 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2970 ; AVX2-NEXT: callq __truncdfhf2 2971 ; AVX2-NEXT: movzwl %ax, %eax 2972 ; AVX2-NEXT: orl %ebx, %eax 2973 ; AVX2-NEXT: shlq $32, %rax 2974 ; AVX2-NEXT: orq %r15, %rax 2975 ; AVX2-NEXT: vmovq %rax, %xmm0 2976 ; AVX2-NEXT: vmovq %r14, %xmm1 2977 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2978 ; AVX2-NEXT: addq $64, %rsp 2979 ; AVX2-NEXT: popq %rbx 2980 ; AVX2-NEXT: popq %r14 2981 ; AVX2-NEXT: popq %r15 2982 ; AVX2-NEXT: retq 2983 ; 2984 ; AVX512-LABEL: cvt_8f64_to_8i16: 2985 ; AVX512: # BB#0: 2986 ; AVX512-NEXT: pushq %r15 2987 ; AVX512-NEXT: .Ltmp18: 2988 ; AVX512-NEXT: .cfi_def_cfa_offset 16 2989 ; AVX512-NEXT: pushq %r14 2990 ; AVX512-NEXT: .Ltmp19: 2991 ; AVX512-NEXT: .cfi_def_cfa_offset 24 2992 ; AVX512-NEXT: pushq %rbx 2993 ; AVX512-NEXT: .Ltmp20: 2994 ; AVX512-NEXT: .cfi_def_cfa_offset 32 2995 ; AVX512-NEXT: subq $96, %rsp 2996 ; AVX512-NEXT: .Ltmp21: 2997 ; AVX512-NEXT: .cfi_def_cfa_offset 128 2998 ; AVX512-NEXT: .Ltmp22: 2999 ; AVX512-NEXT: .cfi_offset %rbx, -32 3000 ; AVX512-NEXT: .Ltmp23: 3001 ; AVX512-NEXT: .cfi_offset %r14, -24 3002 ; AVX512-NEXT: .Ltmp24: 3003 ; AVX512-NEXT: .cfi_offset %r15, -16 3004 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill 3005 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3006 ; AVX512-NEXT: callq __truncdfhf2 3007 ; AVX512-NEXT: movw %ax, %bx 3008 ; AVX512-NEXT: shll $16, %ebx 3009 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 3010 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 3011 ; AVX512-NEXT: callq __truncdfhf2 3012 ; AVX512-NEXT: movzwl %ax, %r15d 3013 ; AVX512-NEXT: orl %ebx, %r15d 3014 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 3015 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3016 ; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3017 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3018 ; AVX512-NEXT: callq __truncdfhf2 3019 ; AVX512-NEXT: movw %ax, %bx 3020 ; AVX512-NEXT: shll $16, %ebx 3021 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3022 ; AVX512-NEXT: callq __truncdfhf2 3023 ; AVX512-NEXT: movzwl %ax, %r14d 3024 ; AVX512-NEXT: orl %ebx, %r14d 3025 ; AVX512-NEXT: shlq $32, %r14 3026 ; AVX512-NEXT: orq %r15, %r14 3027 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 3028 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 3029 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3030 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3031 ; AVX512-NEXT: callq __truncdfhf2 3032 ; AVX512-NEXT: movw %ax, %bx 3033 ; AVX512-NEXT: shll $16, %ebx 3034 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3035 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3036 ; AVX512-NEXT: callq __truncdfhf2 3037 ; AVX512-NEXT: movzwl %ax, %r15d 3038 ; AVX512-NEXT: orl %ebx, %r15d 3039 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3040 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3041 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3042 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3043 ; AVX512-NEXT: callq __truncdfhf2 3044 ; AVX512-NEXT: movw %ax, %bx 3045 ; AVX512-NEXT: shll $16, %ebx 3046 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3047 ; AVX512-NEXT: callq __truncdfhf2 3048 ; AVX512-NEXT: movzwl %ax, %eax 3049 ; AVX512-NEXT: orl %ebx, %eax 3050 ; AVX512-NEXT: shlq $32, %rax 3051 ; AVX512-NEXT: orq %r15, %rax 3052 ; AVX512-NEXT: vmovq %rax, %xmm0 3053 ; AVX512-NEXT: vmovq %r14, %xmm1 3054 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3055 ; AVX512-NEXT: addq $96, %rsp 3056 ; AVX512-NEXT: popq %rbx 3057 ; AVX512-NEXT: popq %r14 3058 ; AVX512-NEXT: popq %r15 3059 ; AVX512-NEXT: retq 3060 %1 = fptrunc <8 x double> %a0 to <8 x half> 3061 %2 = bitcast <8 x half> %1 to <8 x i16> 3062 ret <8 x i16> %2 3063 } 3064 3065 ; 3066 ; Double to Half (Store) 3067 ; 3068 3069 define void @store_cvt_f64_to_i16(double %a0, i16* %a1) { 3070 ; ALL-LABEL: store_cvt_f64_to_i16: 3071 ; ALL: # BB#0: 3072 ; ALL-NEXT: pushq %rbx 3073 ; ALL-NEXT: .Ltmp25: 3074 ; ALL-NEXT: .cfi_def_cfa_offset 16 3075 ; ALL-NEXT: .Ltmp26: 3076 ; ALL-NEXT: .cfi_offset %rbx, -16 3077 ; ALL-NEXT: movq %rdi, %rbx 3078 ; ALL-NEXT: callq __truncdfhf2 3079 ; ALL-NEXT: movw %ax, (%rbx) 3080 ; ALL-NEXT: popq %rbx 3081 ; ALL-NEXT: retq 3082 %1 = fptrunc double %a0 to half 3083 %2 = bitcast half %1 to i16 3084 store i16 %2, i16* %a1 3085 ret void 3086 } 3087 3088 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) { 3089 ; ALL-LABEL: store_cvt_2f64_to_2i16: 3090 ; ALL: # BB#0: 3091 ; ALL-NEXT: pushq %rbp 3092 ; ALL-NEXT: .Ltmp27: 3093 ; ALL-NEXT: .cfi_def_cfa_offset 16 3094 ; ALL-NEXT: pushq %rbx 3095 ; ALL-NEXT: .Ltmp28: 3096 ; ALL-NEXT: .cfi_def_cfa_offset 24 3097 ; ALL-NEXT: subq $24, %rsp 3098 ; ALL-NEXT: .Ltmp29: 3099 ; ALL-NEXT: .cfi_def_cfa_offset 48 3100 ; ALL-NEXT: .Ltmp30: 3101 ; ALL-NEXT: .cfi_offset %rbx, -24 3102 ; ALL-NEXT: .Ltmp31: 3103 ; ALL-NEXT: .cfi_offset %rbp, -16 3104 ; ALL-NEXT: movq %rdi, %rbx 3105 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3106 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3107 ; ALL-NEXT: callq __truncdfhf2 3108 ; ALL-NEXT: movl %eax, %ebp 3109 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3110 ; ALL-NEXT: callq __truncdfhf2 3111 ; ALL-NEXT: movw %ax, (%rbx) 3112 ; ALL-NEXT: movw %bp, 2(%rbx) 3113 ; ALL-NEXT: addq $24, %rsp 3114 ; ALL-NEXT: popq %rbx 3115 ; ALL-NEXT: popq %rbp 3116 ; ALL-NEXT: retq 3117 %1 = fptrunc <2 x double> %a0 to <2 x half> 3118 %2 = bitcast <2 x half> %1 to <2 x i16> 3119 store <2 x i16> %2, <2 x i16>* %a1 3120 ret void 3121 } 3122 3123 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) { 3124 ; AVX1-LABEL: store_cvt_4f64_to_4i16: 3125 ; AVX1: # BB#0: 3126 ; AVX1-NEXT: pushq %rbp 3127 ; AVX1-NEXT: .Ltmp32: 3128 ; AVX1-NEXT: .cfi_def_cfa_offset 16 3129 ; AVX1-NEXT: pushq %r15 3130 ; AVX1-NEXT: .Ltmp33: 3131 ; AVX1-NEXT: .cfi_def_cfa_offset 24 3132 ; AVX1-NEXT: pushq %r14 3133 ; AVX1-NEXT: .Ltmp34: 3134 ; AVX1-NEXT: .cfi_def_cfa_offset 32 3135 ; AVX1-NEXT: pushq %rbx 3136 ; AVX1-NEXT: .Ltmp35: 3137 ; AVX1-NEXT: .cfi_def_cfa_offset 40 3138 ; AVX1-NEXT: subq $88, %rsp 3139 ; AVX1-NEXT: .Ltmp36: 3140 ; AVX1-NEXT: .cfi_def_cfa_offset 128 3141 ; AVX1-NEXT: .Ltmp37: 3142 ; AVX1-NEXT: .cfi_offset %rbx, -40 3143 ; AVX1-NEXT: .Ltmp38: 3144 ; AVX1-NEXT: .cfi_offset %r14, -32 3145 ; AVX1-NEXT: .Ltmp39: 3146 ; AVX1-NEXT: .cfi_offset %r15, -24 3147 ; AVX1-NEXT: .Ltmp40: 3148 ; AVX1-NEXT: .cfi_offset %rbp, -16 3149 ; AVX1-NEXT: movq %rdi, %rbx 3150 ; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3151 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3152 ; AVX1-NEXT: vzeroupper 3153 ; AVX1-NEXT: callq __truncdfhf2 3154 ; AVX1-NEXT: movl %eax, %r14d 3155 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3156 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3157 ; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3158 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3159 ; AVX1-NEXT: vzeroupper 3160 ; AVX1-NEXT: callq __truncdfhf2 3161 ; AVX1-NEXT: movl %eax, %r15d 3162 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3163 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3164 ; AVX1-NEXT: vzeroupper 3165 ; AVX1-NEXT: callq __truncdfhf2 3166 ; AVX1-NEXT: movl %eax, %ebp 3167 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3168 ; AVX1-NEXT: callq __truncdfhf2 3169 ; AVX1-NEXT: movw %ax, 4(%rbx) 3170 ; AVX1-NEXT: movw %bp, (%rbx) 3171 ; AVX1-NEXT: movw %r15w, 6(%rbx) 3172 ; AVX1-NEXT: movw %r14w, 2(%rbx) 3173 ; AVX1-NEXT: addq $88, %rsp 3174 ; AVX1-NEXT: popq %rbx 3175 ; AVX1-NEXT: popq %r14 3176 ; AVX1-NEXT: popq %r15 3177 ; AVX1-NEXT: popq %rbp 3178 ; AVX1-NEXT: retq 3179 ; 3180 ; AVX2-LABEL: store_cvt_4f64_to_4i16: 3181 ; AVX2: # BB#0: 3182 ; AVX2-NEXT: pushq %rbp 3183 ; AVX2-NEXT: .Ltmp32: 3184 ; AVX2-NEXT: .cfi_def_cfa_offset 16 3185 ; AVX2-NEXT: pushq %r15 3186 ; AVX2-NEXT: .Ltmp33: 3187 ; AVX2-NEXT: .cfi_def_cfa_offset 24 3188 ; AVX2-NEXT: pushq %r14 3189 ; AVX2-NEXT: .Ltmp34: 3190 ; AVX2-NEXT: .cfi_def_cfa_offset 32 3191 ; AVX2-NEXT: pushq %rbx 3192 ; AVX2-NEXT: .Ltmp35: 3193 ; AVX2-NEXT: .cfi_def_cfa_offset 40 3194 ; AVX2-NEXT: subq $88, %rsp 3195 ; AVX2-NEXT: .Ltmp36: 3196 ; AVX2-NEXT: .cfi_def_cfa_offset 128 3197 ; AVX2-NEXT: .Ltmp37: 3198 ; AVX2-NEXT: .cfi_offset %rbx, -40 3199 ; AVX2-NEXT: .Ltmp38: 3200 ; AVX2-NEXT: .cfi_offset %r14, -32 3201 ; AVX2-NEXT: .Ltmp39: 3202 ; AVX2-NEXT: .cfi_offset %r15, -24 3203 ; AVX2-NEXT: .Ltmp40: 3204 ; AVX2-NEXT: .cfi_offset %rbp, -16 3205 ; AVX2-NEXT: movq %rdi, %rbx 3206 ; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3207 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3208 ; AVX2-NEXT: vzeroupper 3209 ; AVX2-NEXT: callq __truncdfhf2 3210 ; AVX2-NEXT: movl %eax, %r14d 3211 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3212 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3213 ; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3214 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3215 ; AVX2-NEXT: vzeroupper 3216 ; AVX2-NEXT: callq __truncdfhf2 3217 ; AVX2-NEXT: movl %eax, %r15d 3218 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3219 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3220 ; AVX2-NEXT: vzeroupper 3221 ; AVX2-NEXT: callq __truncdfhf2 3222 ; AVX2-NEXT: movl %eax, %ebp 3223 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3224 ; AVX2-NEXT: callq __truncdfhf2 3225 ; AVX2-NEXT: movw %ax, 4(%rbx) 3226 ; AVX2-NEXT: movw %bp, (%rbx) 3227 ; AVX2-NEXT: movw %r15w, 6(%rbx) 3228 ; AVX2-NEXT: movw %r14w, 2(%rbx) 3229 ; AVX2-NEXT: addq $88, %rsp 3230 ; AVX2-NEXT: popq %rbx 3231 ; AVX2-NEXT: popq %r14 3232 ; AVX2-NEXT: popq %r15 3233 ; AVX2-NEXT: popq %rbp 3234 ; AVX2-NEXT: retq 3235 ; 3236 ; AVX512-LABEL: store_cvt_4f64_to_4i16: 3237 ; AVX512: # BB#0: 3238 ; AVX512-NEXT: pushq %rbp 3239 ; AVX512-NEXT: .Ltmp32: 3240 ; AVX512-NEXT: .cfi_def_cfa_offset 16 3241 ; AVX512-NEXT: pushq %r15 3242 ; AVX512-NEXT: .Ltmp33: 3243 ; AVX512-NEXT: .cfi_def_cfa_offset 24 3244 ; AVX512-NEXT: pushq %r14 3245 ; AVX512-NEXT: .Ltmp34: 3246 ; AVX512-NEXT: .cfi_def_cfa_offset 32 3247 ; AVX512-NEXT: pushq %rbx 3248 ; AVX512-NEXT: .Ltmp35: 3249 ; AVX512-NEXT: .cfi_def_cfa_offset 40 3250 ; AVX512-NEXT: subq $88, %rsp 3251 ; AVX512-NEXT: .Ltmp36: 3252 ; AVX512-NEXT: .cfi_def_cfa_offset 128 3253 ; AVX512-NEXT: .Ltmp37: 3254 ; AVX512-NEXT: .cfi_offset %rbx, -40 3255 ; AVX512-NEXT: .Ltmp38: 3256 ; AVX512-NEXT: .cfi_offset %r14, -32 3257 ; AVX512-NEXT: .Ltmp39: 3258 ; AVX512-NEXT: .cfi_offset %r15, -24 3259 ; AVX512-NEXT: .Ltmp40: 3260 ; AVX512-NEXT: .cfi_offset %rbp, -16 3261 ; AVX512-NEXT: movq %rdi, %rbx 3262 ; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3263 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3264 ; AVX512-NEXT: callq __truncdfhf2 3265 ; AVX512-NEXT: movl %eax, %r14d 3266 ; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3267 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3268 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3269 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3270 ; AVX512-NEXT: callq __truncdfhf2 3271 ; AVX512-NEXT: movl %eax, %r15d 3272 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3273 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3274 ; AVX512-NEXT: callq __truncdfhf2 3275 ; AVX512-NEXT: movl %eax, %ebp 3276 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3277 ; AVX512-NEXT: callq __truncdfhf2 3278 ; AVX512-NEXT: movw %ax, 4(%rbx) 3279 ; AVX512-NEXT: movw %bp, (%rbx) 3280 ; AVX512-NEXT: movw %r15w, 6(%rbx) 3281 ; AVX512-NEXT: movw %r14w, 2(%rbx) 3282 ; AVX512-NEXT: addq $88, %rsp 3283 ; AVX512-NEXT: popq %rbx 3284 ; AVX512-NEXT: popq %r14 3285 ; AVX512-NEXT: popq %r15 3286 ; AVX512-NEXT: popq %rbp 3287 ; AVX512-NEXT: retq 3288 %1 = fptrunc <4 x double> %a0 to <4 x half> 3289 %2 = bitcast <4 x half> %1 to <4 x i16> 3290 store <4 x i16> %2, <4 x i16>* %a1 3291 ret void 3292 } 3293 3294 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) { 3295 ; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: 3296 ; AVX1: # BB#0: 3297 ; AVX1-NEXT: pushq %rbp 3298 ; AVX1-NEXT: .Ltmp41: 3299 ; AVX1-NEXT: .cfi_def_cfa_offset 16 3300 ; AVX1-NEXT: pushq %r14 3301 ; AVX1-NEXT: .Ltmp42: 3302 ; AVX1-NEXT: .cfi_def_cfa_offset 24 3303 ; AVX1-NEXT: pushq %rbx 3304 ; AVX1-NEXT: .Ltmp43: 3305 ; AVX1-NEXT: .cfi_def_cfa_offset 32 3306 ; AVX1-NEXT: subq $32, %rsp 3307 ; AVX1-NEXT: .Ltmp44: 3308 ; AVX1-NEXT: .cfi_def_cfa_offset 64 3309 ; AVX1-NEXT: .Ltmp45: 3310 ; AVX1-NEXT: .cfi_offset %rbx, -32 3311 ; AVX1-NEXT: .Ltmp46: 3312 ; AVX1-NEXT: .cfi_offset %r14, -24 3313 ; AVX1-NEXT: .Ltmp47: 3314 ; AVX1-NEXT: .cfi_offset %rbp, -16 3315 ; AVX1-NEXT: movq %rdi, %r14 3316 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3317 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3318 ; AVX1-NEXT: vzeroupper 3319 ; AVX1-NEXT: callq __truncdfhf2 3320 ; AVX1-NEXT: movw %ax, %bp 3321 ; AVX1-NEXT: shll $16, %ebp 3322 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3323 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3324 ; AVX1-NEXT: vzeroupper 3325 ; AVX1-NEXT: callq __truncdfhf2 3326 ; AVX1-NEXT: movzwl %ax, %ebx 3327 ; AVX1-NEXT: orl %ebp, %ebx 3328 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3329 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3330 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3331 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3332 ; AVX1-NEXT: vzeroupper 3333 ; AVX1-NEXT: callq __truncdfhf2 3334 ; AVX1-NEXT: movw %ax, %bp 3335 ; AVX1-NEXT: shll $16, %ebp 3336 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3337 ; AVX1-NEXT: callq __truncdfhf2 3338 ; AVX1-NEXT: movzwl %ax, %eax 3339 ; AVX1-NEXT: orl %ebp, %eax 3340 ; AVX1-NEXT: shlq $32, %rax 3341 ; AVX1-NEXT: orq %rbx, %rax 3342 ; AVX1-NEXT: vmovq %rax, %xmm0 3343 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3344 ; AVX1-NEXT: vmovdqa %xmm0, (%r14) 3345 ; AVX1-NEXT: addq $32, %rsp 3346 ; AVX1-NEXT: popq %rbx 3347 ; AVX1-NEXT: popq %r14 3348 ; AVX1-NEXT: popq %rbp 3349 ; AVX1-NEXT: retq 3350 ; 3351 ; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: 3352 ; AVX2: # BB#0: 3353 ; AVX2-NEXT: pushq %rbp 3354 ; AVX2-NEXT: .Ltmp41: 3355 ; AVX2-NEXT: .cfi_def_cfa_offset 16 3356 ; AVX2-NEXT: pushq %r14 3357 ; AVX2-NEXT: .Ltmp42: 3358 ; AVX2-NEXT: .cfi_def_cfa_offset 24 3359 ; AVX2-NEXT: pushq %rbx 3360 ; AVX2-NEXT: .Ltmp43: 3361 ; AVX2-NEXT: .cfi_def_cfa_offset 32 3362 ; AVX2-NEXT: subq $32, %rsp 3363 ; AVX2-NEXT: .Ltmp44: 3364 ; AVX2-NEXT: .cfi_def_cfa_offset 64 3365 ; AVX2-NEXT: .Ltmp45: 3366 ; AVX2-NEXT: .cfi_offset %rbx, -32 3367 ; AVX2-NEXT: .Ltmp46: 3368 ; AVX2-NEXT: .cfi_offset %r14, -24 3369 ; AVX2-NEXT: .Ltmp47: 3370 ; AVX2-NEXT: .cfi_offset %rbp, -16 3371 ; AVX2-NEXT: movq %rdi, %r14 3372 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3373 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3374 ; AVX2-NEXT: vzeroupper 3375 ; AVX2-NEXT: callq __truncdfhf2 3376 ; AVX2-NEXT: movw %ax, %bp 3377 ; AVX2-NEXT: shll $16, %ebp 3378 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3379 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3380 ; AVX2-NEXT: vzeroupper 3381 ; AVX2-NEXT: callq __truncdfhf2 3382 ; AVX2-NEXT: movzwl %ax, %ebx 3383 ; AVX2-NEXT: orl %ebp, %ebx 3384 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3385 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3386 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3387 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3388 ; AVX2-NEXT: vzeroupper 3389 ; AVX2-NEXT: callq __truncdfhf2 3390 ; AVX2-NEXT: movw %ax, %bp 3391 ; AVX2-NEXT: shll $16, %ebp 3392 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3393 ; AVX2-NEXT: callq __truncdfhf2 3394 ; AVX2-NEXT: movzwl %ax, %eax 3395 ; AVX2-NEXT: orl %ebp, %eax 3396 ; AVX2-NEXT: shlq $32, %rax 3397 ; AVX2-NEXT: orq %rbx, %rax 3398 ; AVX2-NEXT: vmovq %rax, %xmm0 3399 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3400 ; AVX2-NEXT: vmovdqa %xmm0, (%r14) 3401 ; AVX2-NEXT: addq $32, %rsp 3402 ; AVX2-NEXT: popq %rbx 3403 ; AVX2-NEXT: popq %r14 3404 ; AVX2-NEXT: popq %rbp 3405 ; AVX2-NEXT: retq 3406 ; 3407 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: 3408 ; AVX512: # BB#0: 3409 ; AVX512-NEXT: pushq %rbp 3410 ; AVX512-NEXT: .Ltmp41: 3411 ; AVX512-NEXT: .cfi_def_cfa_offset 16 3412 ; AVX512-NEXT: pushq %r14 3413 ; AVX512-NEXT: .Ltmp42: 3414 ; AVX512-NEXT: .cfi_def_cfa_offset 24 3415 ; AVX512-NEXT: pushq %rbx 3416 ; AVX512-NEXT: .Ltmp43: 3417 ; AVX512-NEXT: .cfi_def_cfa_offset 32 3418 ; AVX512-NEXT: subq $32, %rsp 3419 ; AVX512-NEXT: .Ltmp44: 3420 ; AVX512-NEXT: .cfi_def_cfa_offset 64 3421 ; AVX512-NEXT: .Ltmp45: 3422 ; AVX512-NEXT: .cfi_offset %rbx, -32 3423 ; AVX512-NEXT: .Ltmp46: 3424 ; AVX512-NEXT: .cfi_offset %r14, -24 3425 ; AVX512-NEXT: .Ltmp47: 3426 ; AVX512-NEXT: .cfi_offset %rbp, -16 3427 ; AVX512-NEXT: movq %rdi, %r14 3428 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3429 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3430 ; AVX512-NEXT: callq __truncdfhf2 3431 ; AVX512-NEXT: movw %ax, %bp 3432 ; AVX512-NEXT: shll $16, %ebp 3433 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3434 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3435 ; AVX512-NEXT: callq __truncdfhf2 3436 ; AVX512-NEXT: movzwl %ax, %ebx 3437 ; AVX512-NEXT: orl %ebp, %ebx 3438 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3439 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3440 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3441 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3442 ; AVX512-NEXT: callq __truncdfhf2 3443 ; AVX512-NEXT: movw %ax, %bp 3444 ; AVX512-NEXT: shll $16, %ebp 3445 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3446 ; AVX512-NEXT: callq __truncdfhf2 3447 ; AVX512-NEXT: movzwl %ax, %eax 3448 ; AVX512-NEXT: orl %ebp, %eax 3449 ; AVX512-NEXT: shlq $32, %rax 3450 ; AVX512-NEXT: orq %rbx, %rax 3451 ; AVX512-NEXT: vmovq %rax, %xmm0 3452 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3453 ; AVX512-NEXT: vmovdqa %xmm0, (%r14) 3454 ; AVX512-NEXT: addq $32, %rsp 3455 ; AVX512-NEXT: popq %rbx 3456 ; AVX512-NEXT: popq %r14 3457 ; AVX512-NEXT: popq %rbp 3458 ; AVX512-NEXT: retq 3459 %1 = fptrunc <4 x double> %a0 to <4 x half> 3460 %2 = bitcast <4 x half> %1 to <4 x i16> 3461 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3462 store <8 x i16> %3, <8 x i16>* %a1 3463 ret void 3464 } 3465 3466 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) { 3467 ; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: 3468 ; AVX1: # BB#0: 3469 ; AVX1-NEXT: pushq %rbp 3470 ; AVX1-NEXT: .Ltmp48: 3471 ; AVX1-NEXT: .cfi_def_cfa_offset 16 3472 ; AVX1-NEXT: pushq %r14 3473 ; AVX1-NEXT: .Ltmp49: 3474 ; AVX1-NEXT: .cfi_def_cfa_offset 24 3475 ; AVX1-NEXT: pushq %rbx 3476 ; AVX1-NEXT: .Ltmp50: 3477 ; AVX1-NEXT: .cfi_def_cfa_offset 32 3478 ; AVX1-NEXT: subq $32, %rsp 3479 ; AVX1-NEXT: .Ltmp51: 3480 ; AVX1-NEXT: .cfi_def_cfa_offset 64 3481 ; AVX1-NEXT: .Ltmp52: 3482 ; AVX1-NEXT: .cfi_offset %rbx, -32 3483 ; AVX1-NEXT: .Ltmp53: 3484 ; AVX1-NEXT: .cfi_offset %r14, -24 3485 ; AVX1-NEXT: .Ltmp54: 3486 ; AVX1-NEXT: .cfi_offset %rbp, -16 3487 ; AVX1-NEXT: movq %rdi, %r14 3488 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3489 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3490 ; AVX1-NEXT: vzeroupper 3491 ; AVX1-NEXT: callq __truncdfhf2 3492 ; AVX1-NEXT: movw %ax, %bp 3493 ; AVX1-NEXT: shll $16, %ebp 3494 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3495 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3496 ; AVX1-NEXT: vzeroupper 3497 ; AVX1-NEXT: callq __truncdfhf2 3498 ; AVX1-NEXT: movzwl %ax, %ebx 3499 ; AVX1-NEXT: orl %ebp, %ebx 3500 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3501 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3502 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3503 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3504 ; AVX1-NEXT: vzeroupper 3505 ; AVX1-NEXT: callq __truncdfhf2 3506 ; AVX1-NEXT: movw %ax, %bp 3507 ; AVX1-NEXT: shll $16, %ebp 3508 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3509 ; AVX1-NEXT: callq __truncdfhf2 3510 ; AVX1-NEXT: movzwl %ax, %eax 3511 ; AVX1-NEXT: orl %ebp, %eax 3512 ; AVX1-NEXT: shlq $32, %rax 3513 ; AVX1-NEXT: orq %rbx, %rax 3514 ; AVX1-NEXT: vmovq %rax, %xmm0 3515 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 3516 ; AVX1-NEXT: vmovdqa %xmm0, (%r14) 3517 ; AVX1-NEXT: addq $32, %rsp 3518 ; AVX1-NEXT: popq %rbx 3519 ; AVX1-NEXT: popq %r14 3520 ; AVX1-NEXT: popq %rbp 3521 ; AVX1-NEXT: retq 3522 ; 3523 ; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: 3524 ; AVX2: # BB#0: 3525 ; AVX2-NEXT: pushq %rbp 3526 ; AVX2-NEXT: .Ltmp48: 3527 ; AVX2-NEXT: .cfi_def_cfa_offset 16 3528 ; AVX2-NEXT: pushq %r14 3529 ; AVX2-NEXT: .Ltmp49: 3530 ; AVX2-NEXT: .cfi_def_cfa_offset 24 3531 ; AVX2-NEXT: pushq %rbx 3532 ; AVX2-NEXT: .Ltmp50: 3533 ; AVX2-NEXT: .cfi_def_cfa_offset 32 3534 ; AVX2-NEXT: subq $32, %rsp 3535 ; AVX2-NEXT: .Ltmp51: 3536 ; AVX2-NEXT: .cfi_def_cfa_offset 64 3537 ; AVX2-NEXT: .Ltmp52: 3538 ; AVX2-NEXT: .cfi_offset %rbx, -32 3539 ; AVX2-NEXT: .Ltmp53: 3540 ; AVX2-NEXT: .cfi_offset %r14, -24 3541 ; AVX2-NEXT: .Ltmp54: 3542 ; AVX2-NEXT: .cfi_offset %rbp, -16 3543 ; AVX2-NEXT: movq %rdi, %r14 3544 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3545 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3546 ; AVX2-NEXT: vzeroupper 3547 ; AVX2-NEXT: callq __truncdfhf2 3548 ; AVX2-NEXT: movw %ax, %bp 3549 ; AVX2-NEXT: shll $16, %ebp 3550 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3551 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3552 ; AVX2-NEXT: vzeroupper 3553 ; AVX2-NEXT: callq __truncdfhf2 3554 ; AVX2-NEXT: movzwl %ax, %ebx 3555 ; AVX2-NEXT: orl %ebp, %ebx 3556 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3557 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3558 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3559 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3560 ; AVX2-NEXT: vzeroupper 3561 ; AVX2-NEXT: callq __truncdfhf2 3562 ; AVX2-NEXT: movw %ax, %bp 3563 ; AVX2-NEXT: shll $16, %ebp 3564 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3565 ; AVX2-NEXT: callq __truncdfhf2 3566 ; AVX2-NEXT: movzwl %ax, %eax 3567 ; AVX2-NEXT: orl %ebp, %eax 3568 ; AVX2-NEXT: shlq $32, %rax 3569 ; AVX2-NEXT: orq %rbx, %rax 3570 ; AVX2-NEXT: vmovq %rax, %xmm0 3571 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 3572 ; AVX2-NEXT: vmovdqa %xmm0, (%r14) 3573 ; AVX2-NEXT: addq $32, %rsp 3574 ; AVX2-NEXT: popq %rbx 3575 ; AVX2-NEXT: popq %r14 3576 ; AVX2-NEXT: popq %rbp 3577 ; AVX2-NEXT: retq 3578 ; 3579 ; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: 3580 ; AVX512: # BB#0: 3581 ; AVX512-NEXT: pushq %rbp 3582 ; AVX512-NEXT: .Ltmp48: 3583 ; AVX512-NEXT: .cfi_def_cfa_offset 16 3584 ; AVX512-NEXT: pushq %r14 3585 ; AVX512-NEXT: .Ltmp49: 3586 ; AVX512-NEXT: .cfi_def_cfa_offset 24 3587 ; AVX512-NEXT: pushq %rbx 3588 ; AVX512-NEXT: .Ltmp50: 3589 ; AVX512-NEXT: .cfi_def_cfa_offset 32 3590 ; AVX512-NEXT: subq $32, %rsp 3591 ; AVX512-NEXT: .Ltmp51: 3592 ; AVX512-NEXT: .cfi_def_cfa_offset 64 3593 ; AVX512-NEXT: .Ltmp52: 3594 ; AVX512-NEXT: .cfi_offset %rbx, -32 3595 ; AVX512-NEXT: .Ltmp53: 3596 ; AVX512-NEXT: .cfi_offset %r14, -24 3597 ; AVX512-NEXT: .Ltmp54: 3598 ; AVX512-NEXT: .cfi_offset %rbp, -16 3599 ; AVX512-NEXT: movq %rdi, %r14 3600 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3601 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3602 ; AVX512-NEXT: callq __truncdfhf2 3603 ; AVX512-NEXT: movw %ax, %bp 3604 ; AVX512-NEXT: shll $16, %ebp 3605 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3606 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3607 ; AVX512-NEXT: callq __truncdfhf2 3608 ; AVX512-NEXT: movzwl %ax, %ebx 3609 ; AVX512-NEXT: orl %ebp, %ebx 3610 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3611 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3612 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3613 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3614 ; AVX512-NEXT: callq __truncdfhf2 3615 ; AVX512-NEXT: movw %ax, %bp 3616 ; AVX512-NEXT: shll $16, %ebp 3617 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3618 ; AVX512-NEXT: callq __truncdfhf2 3619 ; AVX512-NEXT: movzwl %ax, %eax 3620 ; AVX512-NEXT: orl %ebp, %eax 3621 ; AVX512-NEXT: shlq $32, %rax 3622 ; AVX512-NEXT: orq %rbx, %rax 3623 ; AVX512-NEXT: vmovq %rax, %xmm0 3624 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 3625 ; AVX512-NEXT: vmovdqa %xmm0, (%r14) 3626 ; AVX512-NEXT: addq $32, %rsp 3627 ; AVX512-NEXT: popq %rbx 3628 ; AVX512-NEXT: popq %r14 3629 ; AVX512-NEXT: popq %rbp 3630 ; AVX512-NEXT: retq 3631 %1 = fptrunc <4 x double> %a0 to <4 x half> 3632 %2 = bitcast <4 x half> %1 to <4 x i16> 3633 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3634 store <8 x i16> %3, <8 x i16>* %a1 3635 ret void 3636 } 3637 3638 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { 3639 ; AVX1-LABEL: store_cvt_8f64_to_8i16: 3640 ; AVX1: # BB#0: 3641 ; AVX1-NEXT: pushq %rbp 3642 ; AVX1-NEXT: .Ltmp55: 3643 ; AVX1-NEXT: .cfi_def_cfa_offset 16 3644 ; AVX1-NEXT: pushq %r15 3645 ; AVX1-NEXT: .Ltmp56: 3646 ; AVX1-NEXT: .cfi_def_cfa_offset 24 3647 ; AVX1-NEXT: pushq %r14 3648 ; AVX1-NEXT: .Ltmp57: 3649 ; AVX1-NEXT: .cfi_def_cfa_offset 32 3650 ; AVX1-NEXT: pushq %r13 3651 ; AVX1-NEXT: .Ltmp58: 3652 ; AVX1-NEXT: .cfi_def_cfa_offset 40 3653 ; AVX1-NEXT: pushq %r12 3654 ; AVX1-NEXT: .Ltmp59: 3655 ; AVX1-NEXT: .cfi_def_cfa_offset 48 3656 ; AVX1-NEXT: pushq %rbx 3657 ; AVX1-NEXT: .Ltmp60: 3658 ; AVX1-NEXT: .cfi_def_cfa_offset 56 3659 ; AVX1-NEXT: subq $136, %rsp 3660 ; AVX1-NEXT: .Ltmp61: 3661 ; AVX1-NEXT: .cfi_def_cfa_offset 192 3662 ; AVX1-NEXT: .Ltmp62: 3663 ; AVX1-NEXT: .cfi_offset %rbx, -56 3664 ; AVX1-NEXT: .Ltmp63: 3665 ; AVX1-NEXT: .cfi_offset %r12, -48 3666 ; AVX1-NEXT: .Ltmp64: 3667 ; AVX1-NEXT: .cfi_offset %r13, -40 3668 ; AVX1-NEXT: .Ltmp65: 3669 ; AVX1-NEXT: .cfi_offset %r14, -32 3670 ; AVX1-NEXT: .Ltmp66: 3671 ; AVX1-NEXT: .cfi_offset %r15, -24 3672 ; AVX1-NEXT: .Ltmp67: 3673 ; AVX1-NEXT: .cfi_offset %rbp, -16 3674 ; AVX1-NEXT: movq %rdi, %rbx 3675 ; AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill 3676 ; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3677 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3678 ; AVX1-NEXT: vzeroupper 3679 ; AVX1-NEXT: callq __truncdfhf2 3680 ; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3681 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3682 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3683 ; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3684 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3685 ; AVX1-NEXT: vzeroupper 3686 ; AVX1-NEXT: callq __truncdfhf2 3687 ; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3688 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3689 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3690 ; AVX1-NEXT: vzeroupper 3691 ; AVX1-NEXT: callq __truncdfhf2 3692 ; AVX1-NEXT: movl %eax, %r12d 3693 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3694 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3695 ; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3696 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3697 ; AVX1-NEXT: vzeroupper 3698 ; AVX1-NEXT: callq __truncdfhf2 3699 ; AVX1-NEXT: movl %eax, %r13d 3700 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3701 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3702 ; AVX1-NEXT: vzeroupper 3703 ; AVX1-NEXT: callq __truncdfhf2 3704 ; AVX1-NEXT: movl %eax, %ebp 3705 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3706 ; AVX1-NEXT: callq __truncdfhf2 3707 ; AVX1-NEXT: movl %eax, %r14d 3708 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3709 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3710 ; AVX1-NEXT: vzeroupper 3711 ; AVX1-NEXT: callq __truncdfhf2 3712 ; AVX1-NEXT: movl %eax, %r15d 3713 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3714 ; AVX1-NEXT: callq __truncdfhf2 3715 ; AVX1-NEXT: movw %ax, 12(%rbx) 3716 ; AVX1-NEXT: movw %r15w, 8(%rbx) 3717 ; AVX1-NEXT: movw %r14w, 4(%rbx) 3718 ; AVX1-NEXT: movw %bp, (%rbx) 3719 ; AVX1-NEXT: movw %r13w, 14(%rbx) 3720 ; AVX1-NEXT: movw %r12w, 10(%rbx) 3721 ; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3722 ; AVX1-NEXT: movw %ax, 6(%rbx) 3723 ; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3724 ; AVX1-NEXT: movw %ax, 2(%rbx) 3725 ; AVX1-NEXT: addq $136, %rsp 3726 ; AVX1-NEXT: popq %rbx 3727 ; AVX1-NEXT: popq %r12 3728 ; AVX1-NEXT: popq %r13 3729 ; AVX1-NEXT: popq %r14 3730 ; AVX1-NEXT: popq %r15 3731 ; AVX1-NEXT: popq %rbp 3732 ; AVX1-NEXT: retq 3733 ; 3734 ; AVX2-LABEL: store_cvt_8f64_to_8i16: 3735 ; AVX2: # BB#0: 3736 ; AVX2-NEXT: pushq %rbp 3737 ; AVX2-NEXT: .Ltmp55: 3738 ; AVX2-NEXT: .cfi_def_cfa_offset 16 3739 ; AVX2-NEXT: pushq %r15 3740 ; AVX2-NEXT: .Ltmp56: 3741 ; AVX2-NEXT: .cfi_def_cfa_offset 24 3742 ; AVX2-NEXT: pushq %r14 3743 ; AVX2-NEXT: .Ltmp57: 3744 ; AVX2-NEXT: .cfi_def_cfa_offset 32 3745 ; AVX2-NEXT: pushq %r13 3746 ; AVX2-NEXT: .Ltmp58: 3747 ; AVX2-NEXT: .cfi_def_cfa_offset 40 3748 ; AVX2-NEXT: pushq %r12 3749 ; AVX2-NEXT: .Ltmp59: 3750 ; AVX2-NEXT: .cfi_def_cfa_offset 48 3751 ; AVX2-NEXT: pushq %rbx 3752 ; AVX2-NEXT: .Ltmp60: 3753 ; AVX2-NEXT: .cfi_def_cfa_offset 56 3754 ; AVX2-NEXT: subq $136, %rsp 3755 ; AVX2-NEXT: .Ltmp61: 3756 ; AVX2-NEXT: .cfi_def_cfa_offset 192 3757 ; AVX2-NEXT: .Ltmp62: 3758 ; AVX2-NEXT: .cfi_offset %rbx, -56 3759 ; AVX2-NEXT: .Ltmp63: 3760 ; AVX2-NEXT: .cfi_offset %r12, -48 3761 ; AVX2-NEXT: .Ltmp64: 3762 ; AVX2-NEXT: .cfi_offset %r13, -40 3763 ; AVX2-NEXT: .Ltmp65: 3764 ; AVX2-NEXT: .cfi_offset %r14, -32 3765 ; AVX2-NEXT: .Ltmp66: 3766 ; AVX2-NEXT: .cfi_offset %r15, -24 3767 ; AVX2-NEXT: .Ltmp67: 3768 ; AVX2-NEXT: .cfi_offset %rbp, -16 3769 ; AVX2-NEXT: movq %rdi, %rbx 3770 ; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill 3771 ; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3772 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3773 ; AVX2-NEXT: vzeroupper 3774 ; AVX2-NEXT: callq __truncdfhf2 3775 ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3776 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3777 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3778 ; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3779 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3780 ; AVX2-NEXT: vzeroupper 3781 ; AVX2-NEXT: callq __truncdfhf2 3782 ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3783 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3784 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3785 ; AVX2-NEXT: vzeroupper 3786 ; AVX2-NEXT: callq __truncdfhf2 3787 ; AVX2-NEXT: movl %eax, %r12d 3788 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3789 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3790 ; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3791 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3792 ; AVX2-NEXT: vzeroupper 3793 ; AVX2-NEXT: callq __truncdfhf2 3794 ; AVX2-NEXT: movl %eax, %r13d 3795 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3796 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3797 ; AVX2-NEXT: vzeroupper 3798 ; AVX2-NEXT: callq __truncdfhf2 3799 ; AVX2-NEXT: movl %eax, %ebp 3800 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3801 ; AVX2-NEXT: callq __truncdfhf2 3802 ; AVX2-NEXT: movl %eax, %r14d 3803 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3804 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3805 ; AVX2-NEXT: vzeroupper 3806 ; AVX2-NEXT: callq __truncdfhf2 3807 ; AVX2-NEXT: movl %eax, %r15d 3808 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3809 ; AVX2-NEXT: callq __truncdfhf2 3810 ; AVX2-NEXT: movw %ax, 12(%rbx) 3811 ; AVX2-NEXT: movw %r15w, 8(%rbx) 3812 ; AVX2-NEXT: movw %r14w, 4(%rbx) 3813 ; AVX2-NEXT: movw %bp, (%rbx) 3814 ; AVX2-NEXT: movw %r13w, 14(%rbx) 3815 ; AVX2-NEXT: movw %r12w, 10(%rbx) 3816 ; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3817 ; AVX2-NEXT: movw %ax, 6(%rbx) 3818 ; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3819 ; AVX2-NEXT: movw %ax, 2(%rbx) 3820 ; AVX2-NEXT: addq $136, %rsp 3821 ; AVX2-NEXT: popq %rbx 3822 ; AVX2-NEXT: popq %r12 3823 ; AVX2-NEXT: popq %r13 3824 ; AVX2-NEXT: popq %r14 3825 ; AVX2-NEXT: popq %r15 3826 ; AVX2-NEXT: popq %rbp 3827 ; AVX2-NEXT: retq 3828 ; 3829 ; AVX512-LABEL: store_cvt_8f64_to_8i16: 3830 ; AVX512: # BB#0: 3831 ; AVX512-NEXT: pushq %rbp 3832 ; AVX512-NEXT: .Ltmp55: 3833 ; AVX512-NEXT: .cfi_def_cfa_offset 16 3834 ; AVX512-NEXT: pushq %r15 3835 ; AVX512-NEXT: .Ltmp56: 3836 ; AVX512-NEXT: .cfi_def_cfa_offset 24 3837 ; AVX512-NEXT: pushq %r14 3838 ; AVX512-NEXT: .Ltmp57: 3839 ; AVX512-NEXT: .cfi_def_cfa_offset 32 3840 ; AVX512-NEXT: pushq %r13 3841 ; AVX512-NEXT: .Ltmp58: 3842 ; AVX512-NEXT: .cfi_def_cfa_offset 40 3843 ; AVX512-NEXT: pushq %r12 3844 ; AVX512-NEXT: .Ltmp59: 3845 ; AVX512-NEXT: .cfi_def_cfa_offset 48 3846 ; AVX512-NEXT: pushq %rbx 3847 ; AVX512-NEXT: .Ltmp60: 3848 ; AVX512-NEXT: .cfi_def_cfa_offset 56 3849 ; AVX512-NEXT: subq $200, %rsp 3850 ; AVX512-NEXT: .Ltmp61: 3851 ; AVX512-NEXT: .cfi_def_cfa_offset 256 3852 ; AVX512-NEXT: .Ltmp62: 3853 ; AVX512-NEXT: .cfi_offset %rbx, -56 3854 ; AVX512-NEXT: .Ltmp63: 3855 ; AVX512-NEXT: .cfi_offset %r12, -48 3856 ; AVX512-NEXT: .Ltmp64: 3857 ; AVX512-NEXT: .cfi_offset %r13, -40 3858 ; AVX512-NEXT: .Ltmp65: 3859 ; AVX512-NEXT: .cfi_offset %r14, -32 3860 ; AVX512-NEXT: .Ltmp66: 3861 ; AVX512-NEXT: .cfi_offset %r15, -24 3862 ; AVX512-NEXT: .Ltmp67: 3863 ; AVX512-NEXT: .cfi_offset %rbp, -16 3864 ; AVX512-NEXT: movq %rdi, %rbx 3865 ; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill 3866 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3867 ; AVX512-NEXT: callq __truncdfhf2 3868 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3869 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload 3870 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3871 ; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3872 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3873 ; AVX512-NEXT: callq __truncdfhf2 3874 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill 3875 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload 3876 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 3877 ; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 3878 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3879 ; AVX512-NEXT: callq __truncdfhf2 3880 ; AVX512-NEXT: movl %eax, %r12d 3881 ; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3882 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3883 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 3884 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3885 ; AVX512-NEXT: callq __truncdfhf2 3886 ; AVX512-NEXT: movl %eax, %r13d 3887 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload 3888 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 3889 ; AVX512-NEXT: callq __truncdfhf2 3890 ; AVX512-NEXT: movl %eax, %ebp 3891 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3892 ; AVX512-NEXT: callq __truncdfhf2 3893 ; AVX512-NEXT: movl %eax, %r14d 3894 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 3895 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 3896 ; AVX512-NEXT: callq __truncdfhf2 3897 ; AVX512-NEXT: movl %eax, %r15d 3898 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 3899 ; AVX512-NEXT: callq __truncdfhf2 3900 ; AVX512-NEXT: movw %ax, 12(%rbx) 3901 ; AVX512-NEXT: movw %r15w, 8(%rbx) 3902 ; AVX512-NEXT: movw %r14w, 4(%rbx) 3903 ; AVX512-NEXT: movw %bp, (%rbx) 3904 ; AVX512-NEXT: movw %r13w, 14(%rbx) 3905 ; AVX512-NEXT: movw %r12w, 10(%rbx) 3906 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3907 ; AVX512-NEXT: movw %ax, 6(%rbx) 3908 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload 3909 ; AVX512-NEXT: movw %ax, 2(%rbx) 3910 ; AVX512-NEXT: addq $200, %rsp 3911 ; AVX512-NEXT: popq %rbx 3912 ; AVX512-NEXT: popq %r12 3913 ; AVX512-NEXT: popq %r13 3914 ; AVX512-NEXT: popq %r14 3915 ; AVX512-NEXT: popq %r15 3916 ; AVX512-NEXT: popq %rbp 3917 ; AVX512-NEXT: retq 3918 %1 = fptrunc <8 x double> %a0 to <8 x half> 3919 %2 = bitcast <8 x half> %1 to <8 x i16> 3920 store <8 x i16> %2, <8 x i16>* %a1 3921 ret void 3922 } 3923