; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1,
%xmm0, %xmm2 158 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 159 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 160 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 161 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 162 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 163 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 164 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 165 ; AVX1-NEXT: vzeroupper 166 ; AVX1-NEXT: retq 167 ; 168 ; AVX2-LABEL: trunc_add_v8i32_v8i16: 169 ; AVX2: # %bb.0: 170 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 171 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 172 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 173 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 174 ; AVX2-NEXT: vzeroupper 175 ; AVX2-NEXT: retq 176 ; 177 ; AVX512-LABEL: trunc_add_v8i32_v8i16: 178 ; AVX512: # %bb.0: 179 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 180 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 181 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 182 ; AVX512-NEXT: vzeroupper 183 ; AVX512-NEXT: retq 184 %1 = add <8 x i32> %a0, %a1 185 %2 = trunc <8 x i32> %1 to <8 x i16> 186 ret <8 x i16> %2 187 } 188 189 define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 190 ; SSE-LABEL: trunc_add_v16i64_v16i8: 191 ; SSE: # %bb.0: 192 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0 193 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1 194 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2 195 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3 196 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4 197 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5 198 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6 199 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7 200 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 201 ; SSE-NEXT: pand %xmm8, %xmm7 202 ; SSE-NEXT: pand %xmm8, %xmm6 203 ; SSE-NEXT: packuswb %xmm7, %xmm6 204 ; SSE-NEXT: pand %xmm8, %xmm5 205 ; SSE-NEXT: pand %xmm8, %xmm4 206 ; SSE-NEXT: packuswb %xmm5, %xmm4 207 ; SSE-NEXT: packuswb %xmm6, %xmm4 208 ; SSE-NEXT: pand %xmm8, %xmm3 209 ; SSE-NEXT: pand %xmm8, %xmm2 210 ; SSE-NEXT: packuswb %xmm3, %xmm2 211 ; SSE-NEXT: pand %xmm8, %xmm1 212 ; SSE-NEXT: pand %xmm8, %xmm0 213 ; SSE-NEXT: packuswb %xmm1, %xmm0 214 ; SSE-NEXT: packuswb %xmm2, %xmm0 215 ; SSE-NEXT: packuswb %xmm4, %xmm0 216 ; SSE-NEXT: retq 217 ; 218 ; AVX1-LABEL: trunc_add_v16i64_v16i8: 219 ; AVX1: # %bb.0: 220 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 221 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 222 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 223 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 224 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4 225 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 226 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 227 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 228 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5 229 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 230 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 231 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 232 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6 233 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 234 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 235 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 236 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 237 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 238 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 239 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 240 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 241 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 242 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 243 ; AVX1-NEXT: vpackusdw %xmm3, 
%xmm2, %xmm2 244 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 245 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 246 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 247 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 248 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 249 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 250 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 251 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 252 ; AVX1-NEXT: vzeroupper 253 ; AVX1-NEXT: retq 254 ; 255 ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: 256 ; AVX2-SLOW: # %bb.0: 257 ; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 258 ; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 259 ; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 260 ; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 261 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 262 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 263 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 264 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 265 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 266 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 267 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 268 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 269 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 270 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 271 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 272 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 273 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 274 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 275 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 276 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 277 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 278 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 279 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 280 ; AVX2-SLOW-NEXT: vzeroupper 281 ; AVX2-SLOW-NEXT: retq 282 ; 283 ; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8: 284 ; AVX2-FAST: # %bb.0: 285 ; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1 286 ; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0 287 ; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3 288 ; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2 289 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 290 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 291 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 292 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 293 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 294 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 295 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 296 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 297 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 298 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 299 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 300 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 301 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 302 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 303 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 304 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 305 ; AVX2-FAST-NEXT: vzeroupper 306 ; AVX2-FAST-NEXT: retq 307 ; 308 ; AVX512-LABEL: trunc_add_v16i64_v16i8: 309 ; AVX512: # %bb.0: 310 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 311 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 312 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 313 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 314 ; 
AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 315 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 316 ; AVX512-NEXT: vzeroupper 317 ; AVX512-NEXT: retq 318 %1 = add <16 x i64> %a0, %a1 319 %2 = trunc <16 x i64> %1 to <16 x i8> 320 ret <16 x i8> %2 321 } 322 323 define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 324 ; SSE-LABEL: trunc_add_v16i32_v16i8: 325 ; SSE: # %bb.0: 326 ; SSE-NEXT: paddd %xmm4, %xmm0 327 ; SSE-NEXT: paddd %xmm5, %xmm1 328 ; SSE-NEXT: paddd %xmm6, %xmm2 329 ; SSE-NEXT: paddd %xmm7, %xmm3 330 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 331 ; SSE-NEXT: pand %xmm4, %xmm3 332 ; SSE-NEXT: pand %xmm4, %xmm2 333 ; SSE-NEXT: packuswb %xmm3, %xmm2 334 ; SSE-NEXT: pand %xmm4, %xmm1 335 ; SSE-NEXT: pand %xmm4, %xmm0 336 ; SSE-NEXT: packuswb %xmm1, %xmm0 337 ; SSE-NEXT: packuswb %xmm2, %xmm0 338 ; SSE-NEXT: retq 339 ; 340 ; AVX1-LABEL: trunc_add_v16i32_v16i8: 341 ; AVX1: # %bb.0: 342 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 343 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 344 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 345 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 346 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2 347 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 348 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 349 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 350 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 351 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 352 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 353 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 354 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 355 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 356 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 357 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 358 ; AVX1-NEXT: vzeroupper 359 ; AVX1-NEXT: retq 360 ; 361 ; AVX2-LABEL: trunc_add_v16i32_v16i8: 362 ; AVX2: # %bb.0: 363 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 364 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 365 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 366 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 367 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 368 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 369 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 370 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 371 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 372 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 373 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 374 ; AVX2-NEXT: vzeroupper 375 ; AVX2-NEXT: retq 376 ; 377 ; AVX512-LABEL: trunc_add_v16i32_v16i8: 378 ; AVX512: # %bb.0: 379 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 380 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 381 ; AVX512-NEXT: vzeroupper 382 ; AVX512-NEXT: retq 383 %1 = add <16 x i32> %a0, %a1 384 %2 = trunc <16 x i32> %1 to <16 x i8> 385 ret <16 x i8> %2 386 } 387 388 define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 389 ; SSE-LABEL: trunc_add_v16i16_v16i8: 390 ; SSE: # %bb.0: 391 ; SSE-NEXT: paddw %xmm2, %xmm0 392 ; SSE-NEXT: paddw %xmm3, %xmm1 393 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 394 ; SSE-NEXT: pand %xmm2, %xmm1 395 ; SSE-NEXT: pand %xmm2, %xmm0 396 ; SSE-NEXT: packuswb %xmm1, %xmm0 397 ; SSE-NEXT: retq 398 ; 399 ; AVX1-LABEL: trunc_add_v16i16_v16i8: 400 ; AVX1: # %bb.0: 401 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 402 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 403 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 404 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 405 ; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 406 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 407 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 408 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 409 ; AVX1-NEXT: vzeroupper 410 ; AVX1-NEXT: retq 411 ; 412 ; AVX2-LABEL: trunc_add_v16i16_v16i8: 413 ; AVX2: # %bb.0: 414 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 415 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 416 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 417 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 418 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 419 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 420 ; AVX2-NEXT: vzeroupper 421 ; AVX2-NEXT: retq 422 ; 423 ; AVX512F-LABEL: trunc_add_v16i16_v16i8: 424 ; AVX512F: # %bb.0: 425 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 426 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 427 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 428 ; AVX512F-NEXT: vzeroupper 429 ; AVX512F-NEXT: retq 430 ; 431 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8: 432 ; AVX512BW: # %bb.0: 433 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 434 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 435 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 436 ; AVX512BW-NEXT: vzeroupper 437 ; AVX512BW-NEXT: retq 438 ; 439 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: 440 ; AVX512DQ: # %bb.0: 441 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 442 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 443 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 444 ; AVX512DQ-NEXT: vzeroupper 445 ; AVX512DQ-NEXT: retq 446 %1 = add <16 x i16> %a0, %a1 447 %2 = trunc <16 x i16> %1 to <16 x i8> 448 ret <16 x i8> %2 449 } 450 451 define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 452 ; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 453 ; SSE: # %bb.0: 454 ; SSE-NEXT: pslld $16, %xmm2 455 ; SSE-NEXT: psrad $16, %xmm2 456 ; SSE-NEXT: pslld $16, %xmm1 457 ; SSE-NEXT: psrad $16, %xmm1 458 ; SSE-NEXT: packssdw %xmm2, %xmm1 459 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 460 ; SSE-NEXT: psraw $8, %xmm0 461 ; SSE-NEXT: paddw %xmm1, %xmm0 462 ; SSE-NEXT: retq 463 ; 464 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 465 ; AVX1: # %bb.0: 466 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 467 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 468 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 469 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 470 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 471 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 472 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 473 ; AVX1-NEXT: vzeroupper 474 ; AVX1-NEXT: retq 475 ; 476 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 477 ; AVX2: # %bb.0: 478 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 479 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 480 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 481 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 482 ; AVX2-NEXT: vzeroupper 483 ; AVX2-NEXT: retq 484 ; 485 ; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 486 ; AVX512: # %bb.0: 487 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 488 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 489 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 490 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 491 ; AVX512-NEXT: vzeroupper 492 ; AVX512-NEXT: retq 493 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 494 %2 = sext <8 x i8> %1 to <8 x i32> 495 %3 = add <8 x i32> %2, %a1 496 %4 = trunc <8 x i32> %3 to <8 x 
i16> 497 ret <8 x i16> %4 498 } 499 500 ; 501 ; add to constant 502 ; 503 504 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 505 ; SSE-LABEL: trunc_add_const_v4i64_v4i32: 506 ; SSE: # %bb.0: 507 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 508 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 509 ; SSE-NEXT: retq 510 ; 511 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32: 512 ; AVX1: # %bb.0: 513 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 514 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 515 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 516 ; AVX1-NEXT: vzeroupper 517 ; AVX1-NEXT: retq 518 ; 519 ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: 520 ; AVX2-SLOW: # %bb.0: 521 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 522 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 523 ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 524 ; AVX2-SLOW-NEXT: vzeroupper 525 ; AVX2-SLOW-NEXT: retq 526 ; 527 ; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: 528 ; AVX2-FAST: # %bb.0: 529 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 530 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 531 ; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 532 ; AVX2-FAST-NEXT: vzeroupper 533 ; AVX2-FAST-NEXT: retq 534 ; 535 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: 536 ; AVX512: # %bb.0: 537 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 538 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 539 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 540 ; AVX512-NEXT: vzeroupper 541 ; AVX512-NEXT: retq 542 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 543 %2 = trunc <4 x i64> %1 to <4 x i32> 544 ret <4 x i32> %2 545 } 546 547 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 548 ; SSE-LABEL: trunc_add_const_v8i64_v8i16: 549 ; SSE: # %bb.0: 550 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 551 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 552 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 553 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 554 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 555 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 556 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 557 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 558 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 559 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 560 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 561 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 562 ; SSE-NEXT: retq 563 ; 564 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: 565 ; AVX1: # %bb.0: 566 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 567 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 568 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 569 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 570 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 571 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 572 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 573 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 574 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 575 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 576 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 577 ; AVX1-NEXT: vzeroupper 578 ; AVX1-NEXT: retq 579 ; 580 ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: 581 ; AVX2-SLOW: # %bb.0: 582 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 583 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 584 ; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 585 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 586 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 587 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 588 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 589 ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 590 ; AVX2-SLOW-NEXT: vzeroupper 591 ; AVX2-SLOW-NEXT: retq 592 ; 593 ; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16: 594 ; AVX2-FAST: # %bb.0: 595 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 596 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 597 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 598 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 599 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 600 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 601 ; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 602 ; AVX2-FAST-NEXT: vzeroupper 603 ; AVX2-FAST-NEXT: retq 604 ; 605 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16: 606 ; AVX512: # %bb.0: 607 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 608 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 609 ; AVX512-NEXT: vzeroupper 610 ; AVX512-NEXT: retq 611 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 612 %2 = trunc <8 x i64> %1 to <8 x i16> 613 ret <8 x i16> %2 614 } 615 616 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 617 ; SSE-LABEL: trunc_add_const_v8i32_v8i16: 618 ; SSE: # %bb.0: 619 ; SSE-NEXT: pslld $16, %xmm1 620 ; SSE-NEXT: psrad $16, %xmm1 621 ; SSE-NEXT: pslld $16, %xmm0 622 ; SSE-NEXT: psrad $16, %xmm0 623 ; SSE-NEXT: packssdw %xmm1, %xmm0 624 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 625 ; SSE-NEXT: retq 626 ; 627 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16: 628 ; AVX1: # %bb.0: 629 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 630 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 631 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 632 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 633 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 634 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 635 ; AVX1-NEXT: vzeroupper 636 ; AVX1-NEXT: retq 637 ; 638 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16: 639 ; AVX2: # %bb.0: 640 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 641 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 642 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 643 ; AVX2-NEXT: vzeroupper 644 ; AVX2-NEXT: retq 645 ; 646 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16: 647 ; AVX512: # %bb.0: 648 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 649 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 650 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 651 ; AVX512-NEXT: vzeroupper 652 ; AVX512-NEXT: retq 653 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 654 %2 = trunc <8 x i32> %1 to <8 x i16> 655 ret <8 x i16> %2 656 } 657 658 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 659 ; SSE-LABEL: trunc_add_const_v16i64_v16i8: 660 ; SSE: # %bb.0: 661 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 662 ; SSE-NEXT: pand %xmm8, %xmm7 663 ; SSE-NEXT: pand %xmm8, %xmm6 664 ; SSE-NEXT: packuswb %xmm7, %xmm6 665 ; SSE-NEXT: pand %xmm8, %xmm5 666 ; SSE-NEXT: pand %xmm8, %xmm4 667 ; SSE-NEXT: packuswb 
%xmm5, %xmm4 668 ; SSE-NEXT: packuswb %xmm6, %xmm4 669 ; SSE-NEXT: pand %xmm8, %xmm3 670 ; SSE-NEXT: pand %xmm8, %xmm2 671 ; SSE-NEXT: packuswb %xmm3, %xmm2 672 ; SSE-NEXT: pand %xmm8, %xmm1 673 ; SSE-NEXT: pand %xmm8, %xmm0 674 ; SSE-NEXT: packuswb %xmm1, %xmm0 675 ; SSE-NEXT: packuswb %xmm2, %xmm0 676 ; SSE-NEXT: packuswb %xmm4, %xmm0 677 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 678 ; SSE-NEXT: retq 679 ; 680 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: 681 ; AVX1: # %bb.0: 682 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 683 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 684 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 685 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 686 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 687 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 688 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 689 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 690 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 691 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 692 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 693 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 694 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 695 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 696 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 697 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 698 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 699 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 700 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 701 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 702 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 703 ; AVX1-NEXT: vzeroupper 704 ; AVX1-NEXT: retq 705 ; 706 ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: 707 ; AVX2-SLOW: # %bb.0: 708 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 709 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 710 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 711 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 712 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 713 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 714 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 715 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 716 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 717 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 718 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 719 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 720 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 721 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 722 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 723 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 724 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 725 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 726 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 727 ; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 728 ; AVX2-SLOW-NEXT: vzeroupper 729 ; AVX2-SLOW-NEXT: retq 730 ; 731 ; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8: 732 ; AVX2-FAST: # %bb.0: 733 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 734 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 735 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 736 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 737 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 738 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 739 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 740 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 741 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 742 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 743 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 744 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 745 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 746 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 747 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 748 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 749 ; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 750 ; AVX2-FAST-NEXT: vzeroupper 751 ; AVX2-FAST-NEXT: retq 752 ; 753 ; AVX512-LABEL: trunc_add_const_v16i64_v16i8: 754 ; AVX512: # %bb.0: 755 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 756 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 757 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 758 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 759 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 760 ; AVX512-NEXT: vzeroupper 761 ; AVX512-NEXT: retq 762 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 763 %2 = trunc <16 x i64> %1 to <16 x i8> 764 ret <16 x i8> %2 765 } 766 767 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 768 ; SSE-LABEL: trunc_add_const_v16i32_v16i8: 769 ; SSE: # %bb.0: 770 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 771 ; SSE-NEXT: pand %xmm4, %xmm3 772 ; SSE-NEXT: pand %xmm4, %xmm2 773 ; SSE-NEXT: packuswb %xmm3, %xmm2 774 ; SSE-NEXT: pand %xmm4, %xmm1 775 ; SSE-NEXT: pand %xmm4, %xmm0 776 ; SSE-NEXT: packuswb %xmm1, %xmm0 777 ; SSE-NEXT: packuswb %xmm2, %xmm0 778 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 779 ; SSE-NEXT: retq 780 ; 781 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: 782 ; AVX1: # %bb.0: 783 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 784 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 785 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 786 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 787 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 788 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 789 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 790 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 791 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 792 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 793 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 794 ; AVX1-NEXT: vzeroupper 795 ; AVX1-NEXT: retq 796 ; 797 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8: 798 ; AVX2: # %bb.0: 799 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 800 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 801 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 802 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 803 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 804 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 805 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 806 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 807 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 808 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 809 ; AVX2-NEXT: vzeroupper 810 ; AVX2-NEXT: retq 811 ; 812 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8: 813 ; AVX512: # %bb.0: 814 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 815 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 816 ; AVX512-NEXT: vzeroupper 817 ; AVX512-NEXT: retq 818 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 819 %2 = trunc <16 x i32> %1 to <16 x i8> 820 ret <16 x i8> %2 821 } 822 
823 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 824 ; SSE-LABEL: trunc_add_const_v16i16_v16i8: 825 ; SSE: # %bb.0: 826 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 827 ; SSE-NEXT: pand %xmm2, %xmm1 828 ; SSE-NEXT: pand %xmm2, %xmm0 829 ; SSE-NEXT: packuswb %xmm1, %xmm0 830 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 831 ; SSE-NEXT: retq 832 ; 833 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: 834 ; AVX1: # %bb.0: 835 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 836 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 837 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 838 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 839 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 840 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 841 ; AVX1-NEXT: vzeroupper 842 ; AVX1-NEXT: retq 843 ; 844 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: 845 ; AVX2: # %bb.0: 846 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 847 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 848 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 849 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 850 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 851 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 852 ; AVX2-NEXT: vzeroupper 853 ; AVX2-NEXT: retq 854 ; 855 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: 856 ; AVX512F: # %bb.0: 857 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 858 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 859 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 860 ; AVX512F-NEXT: vzeroupper 861 ; AVX512F-NEXT: retq 862 ; 863 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: 864 ; AVX512BW: # %bb.0: 865 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 866 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 867 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 868 ; AVX512BW-NEXT: vzeroupper 869 ; AVX512BW-NEXT: retq 870 ; 871 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: 872 ; AVX512DQ: # %bb.0: 873 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 874 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 875 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 876 ; AVX512DQ-NEXT: vzeroupper 877 ; AVX512DQ-NEXT: retq 878 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 879 %2 = trunc <16 x i16> %1 to <16 x i8> 880 ret <16 x i8> %2 881 } 882 883 ; 884 ; sub 885 ; 886 887 define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 888 ; SSE-LABEL: trunc_sub_v4i64_v4i32: 889 ; SSE: # %bb.0: 890 ; SSE-NEXT: psubq %xmm3, %xmm1 891 ; SSE-NEXT: psubq %xmm2, %xmm0 892 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 893 ; SSE-NEXT: retq 894 ; 895 ; AVX1-LABEL: trunc_sub_v4i64_v4i32: 896 ; AVX1: # %bb.0: 897 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 898 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 899 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 900 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 901 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 902 ; AVX1-NEXT: vzeroupper 903 ; AVX1-NEXT: retq 904 ; 905 ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: 906 ; AVX2-SLOW: # %bb.0: 907 ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 908 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 909 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 910 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 911 ; AVX2-SLOW-NEXT: vzeroupper 912 ; AVX2-SLOW-NEXT: retq 913 ; 914 ; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: 915 ; AVX2-FAST: # %bb.0: 916 ; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, 
%ymm0 917 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 918 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 919 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 920 ; AVX2-FAST-NEXT: vzeroupper 921 ; AVX2-FAST-NEXT: retq 922 ; 923 ; AVX512-LABEL: trunc_sub_v4i64_v4i32: 924 ; AVX512: # %bb.0: 925 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 926 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 927 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 928 ; AVX512-NEXT: vzeroupper 929 ; AVX512-NEXT: retq 930 %1 = sub <4 x i64> %a0, %a1 931 %2 = trunc <4 x i64> %1 to <4 x i32> 932 ret <4 x i32> %2 933 } 934 935 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 936 ; SSE-LABEL: trunc_sub_v8i64_v8i16: 937 ; SSE: # %bb.0: 938 ; SSE-NEXT: psubq %xmm6, %xmm2 939 ; SSE-NEXT: psubq %xmm7, %xmm3 940 ; SSE-NEXT: psubq %xmm4, %xmm0 941 ; SSE-NEXT: psubq %xmm5, %xmm1 942 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 943 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 944 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 945 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 946 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 947 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 948 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 949 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 950 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 951 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 952 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 953 ; SSE-NEXT: retq 954 ; 955 ; AVX1-LABEL: trunc_sub_v8i64_v8i16: 956 ; AVX1: # %bb.0: 957 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4 958 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 959 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 960 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 961 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2 962 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 963 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 964 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 965 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 966 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 967 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 968 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 969 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 970 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] 971 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 972 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 973 ; AVX1-NEXT: vzeroupper 974 ; AVX1-NEXT: retq 975 ; 976 ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: 977 ; AVX2-SLOW: # %bb.0: 978 ; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 979 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 980 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 981 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 982 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 983 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 984 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 985 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 986 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 987 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 988 ; AVX2-SLOW-NEXT: vzeroupper 989 ; AVX2-SLOW-NEXT: retq 990 ; 991 ; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16: 992 ; AVX2-FAST: # %bb.0: 993 ; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1 994 ; AVX2-FAST-NEXT: vpsubq 
%ymm2, %ymm0, %ymm0 995 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 996 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 997 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 998 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 999 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1000 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1001 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1002 ; AVX2-FAST-NEXT: vzeroupper 1003 ; AVX2-FAST-NEXT: retq 1004 ; 1005 ; AVX512-LABEL: trunc_sub_v8i64_v8i16: 1006 ; AVX512: # %bb.0: 1007 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 1008 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 1009 ; AVX512-NEXT: vzeroupper 1010 ; AVX512-NEXT: retq 1011 %1 = sub <8 x i64> %a0, %a1 1012 %2 = trunc <8 x i64> %1 to <8 x i16> 1013 ret <8 x i16> %2 1014 } 1015 1016 define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1017 ; SSE-LABEL: trunc_sub_v8i32_v8i16: 1018 ; SSE: # %bb.0: 1019 ; SSE-NEXT: psubd %xmm2, %xmm0 1020 ; SSE-NEXT: psubd %xmm3, %xmm1 1021 ; SSE-NEXT: pslld $16, %xmm1 1022 ; SSE-NEXT: psrad $16, %xmm1 1023 ; SSE-NEXT: pslld $16, %xmm0 1024 ; SSE-NEXT: psrad $16, %xmm0 1025 ; SSE-NEXT: packssdw %xmm1, %xmm0 1026 ; SSE-NEXT: retq 1027 ; 1028 ; AVX1-LABEL: trunc_sub_v8i32_v8i16: 1029 ; AVX1: # %bb.0: 1030 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 1031 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1032 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1033 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1034 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1035 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1036 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1037 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1038 ; AVX1-NEXT: vzeroupper 1039 ; AVX1-NEXT: retq 1040 ; 1041 ; AVX2-LABEL: trunc_sub_v8i32_v8i16: 1042 ; AVX2: # %bb.0: 1043 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1044 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1045 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1046 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1047 ; AVX2-NEXT: vzeroupper 1048 ; AVX2-NEXT: retq 1049 ; 1050 ; AVX512-LABEL: trunc_sub_v8i32_v8i16: 1051 ; AVX512: # %bb.0: 1052 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1053 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1054 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1055 ; AVX512-NEXT: vzeroupper 1056 ; AVX512-NEXT: retq 1057 %1 = sub <8 x i32> %a0, %a1 1058 %2 = trunc <8 x i32> %1 to <8 x i16> 1059 ret <8 x i16> %2 1060 } 1061 1062 define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1063 ; SSE-LABEL: trunc_sub_v16i64_v16i8: 1064 ; SSE: # %bb.0: 1065 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0 1066 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1 1067 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2 1068 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3 1069 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4 1070 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5 1071 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6 1072 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7 1073 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1074 ; SSE-NEXT: pand %xmm8, %xmm7 1075 ; SSE-NEXT: pand %xmm8, %xmm6 1076 ; SSE-NEXT: packuswb %xmm7, %xmm6 1077 ; SSE-NEXT: pand %xmm8, %xmm5 1078 ; SSE-NEXT: pand %xmm8, %xmm4 1079 ; SSE-NEXT: packuswb %xmm5, %xmm4 1080 ; SSE-NEXT: packuswb %xmm6, %xmm4 1081 ; SSE-NEXT: 
pand %xmm8, %xmm3 1082 ; SSE-NEXT: pand %xmm8, %xmm2 1083 ; SSE-NEXT: packuswb %xmm3, %xmm2 1084 ; SSE-NEXT: pand %xmm8, %xmm1 1085 ; SSE-NEXT: pand %xmm8, %xmm0 1086 ; SSE-NEXT: packuswb %xmm1, %xmm0 1087 ; SSE-NEXT: packuswb %xmm2, %xmm0 1088 ; SSE-NEXT: packuswb %xmm4, %xmm0 1089 ; SSE-NEXT: retq 1090 ; 1091 ; AVX1-LABEL: trunc_sub_v16i64_v16i8: 1092 ; AVX1: # %bb.0: 1093 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 1094 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1095 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1096 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 1097 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4 1098 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1099 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1100 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 1101 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 1102 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1103 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1104 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 1105 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6 1106 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1107 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1108 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 1109 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1110 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1111 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1112 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1113 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1114 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1115 ; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1116 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1117 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1118 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1119 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1120 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1121 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1122 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1123 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1124 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1125 ; AVX1-NEXT: vzeroupper 1126 ; AVX1-NEXT: retq 1127 ; 1128 ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: 1129 ; AVX2-SLOW: # %bb.0: 1130 ; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 1131 ; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 1132 ; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 1133 ; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1134 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 1135 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1136 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 1137 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 1138 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1139 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1140 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1141 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1142 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1143 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 1144 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1145 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1146 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1147 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1148 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1149 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1150 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1151 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 1152 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1153 ; AVX2-SLOW-NEXT: vzeroupper 1154 ; AVX2-SLOW-NEXT: retq 1155 ; 1156 ; 
AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8: 1157 ; AVX2-FAST: # %bb.0: 1158 ; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1 1159 ; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0 1160 ; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3 1161 ; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1162 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 1163 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 1164 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 1165 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1166 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1167 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1168 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1169 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1170 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1171 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 1172 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 1173 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1174 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1175 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1176 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1177 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1178 ; AVX2-FAST-NEXT: vzeroupper 1179 ; AVX2-FAST-NEXT: retq 1180 ; 1181 ; AVX512-LABEL: trunc_sub_v16i64_v16i8: 1182 ; AVX512: # %bb.0: 1183 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 1184 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 1185 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1186 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 1187 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1188 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1189 ; AVX512-NEXT: vzeroupper 1190 ; AVX512-NEXT: retq 1191 %1 = sub <16 x i64> %a0, %a1 1192 %2 = trunc <16 x i64> %1 to <16 x i8> 1193 ret <16 x i8> %2 1194 } 1195 1196 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1197 ; SSE-LABEL: trunc_sub_v16i32_v16i8: 1198 ; SSE: # %bb.0: 1199 ; SSE-NEXT: psubd %xmm4, %xmm0 1200 ; SSE-NEXT: psubd %xmm5, %xmm1 1201 ; SSE-NEXT: psubd %xmm6, %xmm2 1202 ; SSE-NEXT: psubd %xmm7, %xmm3 1203 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1204 ; SSE-NEXT: pand %xmm4, %xmm3 1205 ; SSE-NEXT: pand %xmm4, %xmm2 1206 ; SSE-NEXT: packuswb %xmm3, %xmm2 1207 ; SSE-NEXT: pand %xmm4, %xmm1 1208 ; SSE-NEXT: pand %xmm4, %xmm0 1209 ; SSE-NEXT: packuswb %xmm1, %xmm0 1210 ; SSE-NEXT: packuswb %xmm2, %xmm0 1211 ; SSE-NEXT: retq 1212 ; 1213 ; AVX1-LABEL: trunc_sub_v16i32_v16i8: 1214 ; AVX1: # %bb.0: 1215 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4 1216 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1217 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1218 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 1219 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2 1220 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1221 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1222 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 1223 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1224 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1225 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1226 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1227 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1228 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1229 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1230 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1231 ; AVX1-NEXT: vzeroupper 1232 ; AVX1-NEXT: retq 1233 ; 1234 ; AVX2-LABEL: trunc_sub_v16i32_v16i8: 1235 ; AVX2: # %bb.0: 1236 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 1237 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, 
%ymm1 1238 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1239 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1240 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1241 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1242 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1243 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1244 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1245 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1246 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1247 ; AVX2-NEXT: vzeroupper 1248 ; AVX2-NEXT: retq 1249 ; 1250 ; AVX512-LABEL: trunc_sub_v16i32_v16i8: 1251 ; AVX512: # %bb.0: 1252 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 1253 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1254 ; AVX512-NEXT: vzeroupper 1255 ; AVX512-NEXT: retq 1256 %1 = sub <16 x i32> %a0, %a1 1257 %2 = trunc <16 x i32> %1 to <16 x i8> 1258 ret <16 x i8> %2 1259 } 1260 1261 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 1262 ; SSE-LABEL: trunc_sub_v16i16_v16i8: 1263 ; SSE: # %bb.0: 1264 ; SSE-NEXT: psubw %xmm2, %xmm0 1265 ; SSE-NEXT: psubw %xmm3, %xmm1 1266 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1267 ; SSE-NEXT: pand %xmm2, %xmm1 1268 ; SSE-NEXT: pand %xmm2, %xmm0 1269 ; SSE-NEXT: packuswb %xmm1, %xmm0 1270 ; SSE-NEXT: retq 1271 ; 1272 ; AVX1-LABEL: trunc_sub_v16i16_v16i8: 1273 ; AVX1: # %bb.0: 1274 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 1275 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1276 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1277 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1278 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1279 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1280 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1281 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1282 ; AVX1-NEXT: vzeroupper 1283 ; AVX1-NEXT: retq 1284 ; 1285 ; AVX2-LABEL: trunc_sub_v16i16_v16i8: 1286 ; AVX2: # %bb.0: 1287 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1288 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1289 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1290 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1291 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1292 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1293 ; AVX2-NEXT: vzeroupper 1294 ; AVX2-NEXT: retq 1295 ; 1296 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8: 1297 ; AVX512F: # %bb.0: 1298 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1299 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1300 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1301 ; AVX512F-NEXT: vzeroupper 1302 ; AVX512F-NEXT: retq 1303 ; 1304 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: 1305 ; AVX512BW: # %bb.0: 1306 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1307 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1308 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1309 ; AVX512BW-NEXT: vzeroupper 1310 ; AVX512BW-NEXT: retq 1311 ; 1312 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: 1313 ; AVX512DQ: # %bb.0: 1314 ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1315 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1316 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1317 ; AVX512DQ-NEXT: vzeroupper 1318 ; AVX512DQ-NEXT: retq 1319 %1 = sub <16 x i16> %a0, %a1 1320 %2 = trunc <16 x i16> %1 to <16 x i8> 1321 ret <16 x i8> %2 1322 } 1323 1324 ; 1325 ; sub to constant 1326 ; 1327 1328 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 1329 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32: 1330 ; SSE: # %bb.0: 1331 ; SSE-NEXT: movl $1, %eax 
1332 ; SSE-NEXT: movq %rax, %xmm2 1333 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 1334 ; SSE-NEXT: psubq %xmm2, %xmm0 1335 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 1336 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1337 ; SSE-NEXT: retq 1338 ; 1339 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: 1340 ; AVX1: # %bb.0: 1341 ; AVX1-NEXT: movl $1, %eax 1342 ; AVX1-NEXT: vmovq %rax, %xmm1 1343 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] 1344 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 1345 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1346 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 1347 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] 1348 ; AVX1-NEXT: vzeroupper 1349 ; AVX1-NEXT: retq 1350 ; 1351 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: 1352 ; AVX2-SLOW: # %bb.0: 1353 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1354 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1355 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1356 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1357 ; AVX2-SLOW-NEXT: vzeroupper 1358 ; AVX2-SLOW-NEXT: retq 1359 ; 1360 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: 1361 ; AVX2-FAST: # %bb.0: 1362 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1363 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 1364 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 1365 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1366 ; AVX2-FAST-NEXT: vzeroupper 1367 ; AVX2-FAST-NEXT: retq 1368 ; 1369 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: 1370 ; AVX512: # %bb.0: 1371 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1372 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1373 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1374 ; AVX512-NEXT: vzeroupper 1375 ; AVX512-NEXT: retq 1376 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 1377 %2 = trunc <4 x i64> %1 to <4 x i32> 1378 ret <4 x i32> %2 1379 } 1380 1381 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 1382 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16: 1383 ; SSE: # %bb.0: 1384 ; SSE-NEXT: movl $1, %eax 1385 ; SSE-NEXT: movq %rax, %xmm4 1386 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] 1387 ; SSE-NEXT: psubq %xmm4, %xmm0 1388 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 1389 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 1390 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 1391 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1392 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] 1393 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1394 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] 1395 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1396 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1397 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1398 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1399 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1400 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1401 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 1402 ; SSE-NEXT: movapd %xmm2, %xmm0 1403 ; SSE-NEXT: retq 1404 ; 1405 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: 1406 ; AVX1: # %bb.0: 1407 ; AVX1-NEXT: movl $1, %eax 1408 ; AVX1-NEXT: vmovq %rax, %xmm2 1409 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 1410 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 1411 ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 1412 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 1413 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3 1414 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1415 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 1416 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 1417 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 1418 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] 1419 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1420 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] 1421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] 1422 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1423 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1424 ; AVX1-NEXT: vzeroupper 1425 ; AVX1-NEXT: retq 1426 ; 1427 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: 1428 ; AVX2-SLOW: # %bb.0: 1429 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 1430 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1431 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1432 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1433 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1434 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1435 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1436 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1437 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1438 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1439 ; AVX2-SLOW-NEXT: vzeroupper 1440 ; AVX2-SLOW-NEXT: retq 1441 ; 1442 ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: 1443 ; AVX2-FAST: # %bb.0: 1444 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 1445 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1446 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1447 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 1448 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 1449 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1450 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1451 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1452 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1453 ; AVX2-FAST-NEXT: vzeroupper 1454 ; AVX2-FAST-NEXT: retq 1455 ; 1456 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: 1457 ; AVX512: # %bb.0: 1458 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 1459 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 1460 ; AVX512-NEXT: vzeroupper 1461 ; AVX512-NEXT: retq 1462 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 1463 %2 = trunc <8 x i64> %1 to <8 x i16> 1464 ret <8 x i16> %2 1465 } 1466 1467 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 1468 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16: 1469 ; SSE: # %bb.0: 1470 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 1471 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 1472 ; SSE-NEXT: pslld $16, %xmm1 1473 ; SSE-NEXT: psrad $16, %xmm1 1474 ; SSE-NEXT: pslld $16, %xmm0 1475 ; SSE-NEXT: psrad $16, %xmm0 1476 ; SSE-NEXT: packssdw %xmm1, %xmm0 1477 ; SSE-NEXT: retq 1478 ; 1479 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: 1480 ; AVX1: # %bb.0: 1481 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1 1482 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1483 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 1484 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1485 ; 
AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1486 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1487 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1488 ; AVX1-NEXT: vzeroupper 1489 ; AVX1-NEXT: retq 1490 ; 1491 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: 1492 ; AVX2: # %bb.0: 1493 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 1494 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1495 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1496 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1497 ; AVX2-NEXT: vzeroupper 1498 ; AVX2-NEXT: retq 1499 ; 1500 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: 1501 ; AVX512: # %bb.0: 1502 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 1503 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1504 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1505 ; AVX512-NEXT: vzeroupper 1506 ; AVX512-NEXT: retq 1507 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1508 %2 = trunc <8 x i32> %1 to <8 x i16> 1509 ret <8 x i16> %2 1510 } 1511 1512 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 1513 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8: 1514 ; SSE: # %bb.0: 1515 ; SSE-NEXT: movl $1, %eax 1516 ; SSE-NEXT: movq %rax, %xmm8 1517 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] 1518 ; SSE-NEXT: psubq %xmm8, %xmm0 1519 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 1520 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 1521 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 1522 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm4 1523 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm5 1524 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm6 1525 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm7 1526 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1527 ; SSE-NEXT: pand %xmm8, %xmm7 1528 ; SSE-NEXT: pand %xmm8, %xmm6 1529 ; SSE-NEXT: packuswb %xmm7, %xmm6 1530 ; SSE-NEXT: pand %xmm8, %xmm5 1531 ; SSE-NEXT: pand %xmm8, %xmm4 1532 ; SSE-NEXT: packuswb %xmm5, %xmm4 1533 ; SSE-NEXT: packuswb %xmm6, %xmm4 1534 ; SSE-NEXT: pand %xmm8, %xmm3 1535 ; SSE-NEXT: pand %xmm8, %xmm2 1536 ; SSE-NEXT: packuswb %xmm3, %xmm2 1537 ; SSE-NEXT: pand %xmm8, %xmm1 1538 ; SSE-NEXT: pand %xmm8, %xmm0 1539 ; SSE-NEXT: packuswb %xmm1, %xmm0 1540 ; SSE-NEXT: packuswb %xmm2, %xmm0 1541 ; SSE-NEXT: packuswb %xmm4, %xmm0 1542 ; SSE-NEXT: retq 1543 ; 1544 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: 1545 ; AVX1: # %bb.0: 1546 ; AVX1-NEXT: movl $1, %eax 1547 ; AVX1-NEXT: vmovq %rax, %xmm4 1548 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] 1549 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 1550 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1551 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 1552 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5 1553 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1554 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 1555 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6 1556 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1557 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 1558 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7 1559 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1560 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 1561 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1562 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1563 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 1564 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 1565 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1566 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 
1567 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 1568 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1569 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1570 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 1571 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1572 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1573 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 1574 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1575 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1576 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1577 ; AVX1-NEXT: vzeroupper 1578 ; AVX1-NEXT: retq 1579 ; 1580 ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: 1581 ; AVX2-SLOW: # %bb.0: 1582 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 1583 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1584 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 1585 ; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 1586 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 1587 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1588 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 1589 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 1590 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1591 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1592 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1593 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1594 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1595 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 1596 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1597 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1598 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1599 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1600 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1601 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1602 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1603 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 1604 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1605 ; AVX2-SLOW-NEXT: vzeroupper 1606 ; AVX2-SLOW-NEXT: retq 1607 ; 1608 ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: 1609 ; AVX2-FAST: # %bb.0: 1610 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 1611 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 1612 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 1613 ; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 1614 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 1615 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 1616 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 1617 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1618 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1619 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1620 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1621 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1622 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1623 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 1624 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 1625 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1626 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1627 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1628 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1629 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1630 ; AVX2-FAST-NEXT: vzeroupper 1631 ; AVX2-FAST-NEXT: retq 1632 ; 1633 ; AVX512-LABEL: 
trunc_sub_const_v16i64_v16i8: 1634 ; AVX512: # %bb.0: 1635 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 1636 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 1637 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1638 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 1639 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 1640 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1641 ; AVX512-NEXT: vzeroupper 1642 ; AVX512-NEXT: retq 1643 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 1644 %2 = trunc <16 x i64> %1 to <16 x i8> 1645 ret <16 x i8> %2 1646 } 1647 1648 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 1649 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8: 1650 ; SSE: # %bb.0: 1651 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 1652 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 1653 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm2 1654 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm3 1655 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1656 ; SSE-NEXT: pand %xmm4, %xmm3 1657 ; SSE-NEXT: pand %xmm4, %xmm2 1658 ; SSE-NEXT: packuswb %xmm3, %xmm2 1659 ; SSE-NEXT: pand %xmm4, %xmm1 1660 ; SSE-NEXT: pand %xmm4, %xmm0 1661 ; SSE-NEXT: packuswb %xmm1, %xmm0 1662 ; SSE-NEXT: packuswb %xmm2, %xmm0 1663 ; SSE-NEXT: retq 1664 ; 1665 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: 1666 ; AVX1: # %bb.0: 1667 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2 1668 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1669 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 1670 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 1671 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1672 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 1673 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1674 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1675 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1676 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1677 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1678 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1679 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1680 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1681 ; AVX1-NEXT: vzeroupper 1682 ; AVX1-NEXT: retq 1683 ; 1684 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: 1685 ; AVX2: # %bb.0: 1686 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 1687 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 1688 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1689 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1690 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1691 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1692 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1693 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1694 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1695 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1696 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1697 ; AVX2-NEXT: vzeroupper 1698 ; AVX2-NEXT: retq 1699 ; 1700 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: 1701 ; AVX512: # %bb.0: 1702 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 1703 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1704 ; AVX512-NEXT: vzeroupper 1705 ; AVX512-NEXT: retq 1706 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1707 %2 = trunc <16 x i32> %1 to <16 x i8> 1708 ret <16 x i8> %2 1709 } 1710 1711 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 1712 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8: 1713 ; SSE: # %bb.0: 
1714 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 1715 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm1 1716 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1717 ; SSE-NEXT: pand %xmm2, %xmm1 1718 ; SSE-NEXT: pand %xmm2, %xmm0 1719 ; SSE-NEXT: packuswb %xmm1, %xmm0 1720 ; SSE-NEXT: retq 1721 ; 1722 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1723 ; AVX1: # %bb.0: 1724 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 1725 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1726 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1727 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1728 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1729 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1730 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1731 ; AVX1-NEXT: vzeroupper 1732 ; AVX1-NEXT: retq 1733 ; 1734 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1735 ; AVX2: # %bb.0: 1736 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1737 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1738 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1739 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1740 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1741 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1742 ; AVX2-NEXT: vzeroupper 1743 ; AVX2-NEXT: retq 1744 ; 1745 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1746 ; AVX512F: # %bb.0: 1747 ; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1748 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 1749 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1750 ; AVX512F-NEXT: vzeroupper 1751 ; AVX512F-NEXT: retq 1752 ; 1753 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1754 ; AVX512BW: # %bb.0: 1755 ; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1756 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1757 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1758 ; AVX512BW-NEXT: vzeroupper 1759 ; AVX512BW-NEXT: retq 1760 ; 1761 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: 1762 ; AVX512DQ: # %bb.0: 1763 ; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 1764 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1765 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1766 ; AVX512DQ-NEXT: vzeroupper 1767 ; AVX512DQ-NEXT: retq 1768 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1769 %2 = trunc <16 x i16> %1 to <16 x i8> 1770 ret <16 x i8> %2 1771 } 1772 1773 ; 1774 ; mul 1775 ; 1776 1777 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1778 ; SSE-LABEL: trunc_mul_v4i64_v4i32: 1779 ; SSE: # %bb.0: 1780 ; SSE-NEXT: movdqa %xmm1, %xmm4 1781 ; SSE-NEXT: psrlq $32, %xmm4 1782 ; SSE-NEXT: pmuludq %xmm3, %xmm4 1783 ; SSE-NEXT: movdqa %xmm3, %xmm5 1784 ; SSE-NEXT: psrlq $32, %xmm5 1785 ; SSE-NEXT: pmuludq %xmm1, %xmm5 1786 ; SSE-NEXT: paddq %xmm4, %xmm5 1787 ; SSE-NEXT: psllq $32, %xmm5 1788 ; SSE-NEXT: pmuludq %xmm3, %xmm1 1789 ; SSE-NEXT: paddq %xmm5, %xmm1 1790 ; SSE-NEXT: movdqa %xmm0, %xmm3 1791 ; SSE-NEXT: psrlq $32, %xmm3 1792 ; SSE-NEXT: pmuludq %xmm2, %xmm3 1793 ; SSE-NEXT: movdqa %xmm2, %xmm4 1794 ; SSE-NEXT: psrlq $32, %xmm4 1795 ; SSE-NEXT: pmuludq %xmm0, %xmm4 1796 ; SSE-NEXT: paddq %xmm3, %xmm4 1797 ; SSE-NEXT: psllq $32, %xmm4 1798 ; SSE-NEXT: pmuludq %xmm2, %xmm0 1799 ; SSE-NEXT: paddq %xmm4, %xmm0 1800 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1801 ; SSE-NEXT: retq 1802 ; 1803 ; AVX1-LABEL: trunc_mul_v4i64_v4i32: 1804 ; AVX1: # %bb.0: 1805 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1806 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1807 ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1808 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1809 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1810 ; AVX1-NEXT: vzeroupper 1811 ; AVX1-NEXT: retq 1812 ; 1813 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: 1814 ; AVX2-SLOW: # %bb.0: 1815 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1816 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1817 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1818 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1819 ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1820 ; AVX2-SLOW-NEXT: vzeroupper 1821 ; AVX2-SLOW-NEXT: retq 1822 ; 1823 ; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: 1824 ; AVX2-FAST: # %bb.0: 1825 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1826 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 1827 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 1828 ; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1829 ; AVX2-FAST-NEXT: vzeroupper 1830 ; AVX2-FAST-NEXT: retq 1831 ; 1832 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: 1833 ; AVX512F: # %bb.0: 1834 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1835 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1836 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1837 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1838 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1839 ; AVX512F-NEXT: vzeroupper 1840 ; AVX512F-NEXT: retq 1841 ; 1842 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1843 ; AVX512BW: # %bb.0: 1844 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1845 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1846 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1847 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1848 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1849 ; AVX512BW-NEXT: vzeroupper 1850 ; AVX512BW-NEXT: retq 1851 ; 1852 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1853 ; AVX512DQ: # %bb.0: 1854 ; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1855 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1856 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1857 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1858 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1859 ; AVX512DQ-NEXT: vzeroupper 1860 ; AVX512DQ-NEXT: retq 1861 %1 = mul <4 x i64> %a0, %a1 1862 %2 = trunc <4 x i64> %1 to <4 x i32> 1863 ret <4 x i32> %2 1864 } 1865 1866 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1867 ; SSE-LABEL: trunc_mul_v8i64_v8i16: 1868 ; SSE: # %bb.0: 1869 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1870 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 1871 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1872 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1873 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1874 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] 1875 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] 1876 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1877 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] 1878 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1879 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] 1880 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1881 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1882 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1883 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1884 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1885 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1886 ; SSE-NEXT: pshuflw 
{{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1887 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1888 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1889 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1890 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1891 ; SSE-NEXT: pmullw %xmm6, %xmm0 1892 ; SSE-NEXT: retq 1893 ; 1894 ; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1895 ; AVX1: # %bb.0: 1896 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 1897 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 1898 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] 1899 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] 1900 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 1901 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1902 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] 1903 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] 1904 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 1905 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1906 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1907 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] 1908 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] 1909 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1910 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1911 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] 1912 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] 1913 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1914 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1915 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1916 ; AVX1-NEXT: vzeroupper 1917 ; AVX1-NEXT: retq 1918 ; 1919 ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: 1920 ; AVX2-SLOW: # %bb.0: 1921 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 1922 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1923 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 1924 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 1925 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1926 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1927 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1928 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1929 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 1930 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1931 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 1932 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 1933 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1934 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1935 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1936 ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1937 ; AVX2-SLOW-NEXT: vzeroupper 1938 ; AVX2-SLOW-NEXT: retq 1939 ; 1940 ; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: 1941 ; AVX2-FAST: # %bb.0: 1942 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 1943 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 1944 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 1945 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 1946 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 1947 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1948 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 1949 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 1950 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 1951 ; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1952 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1953 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1954 ; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1955 ; AVX2-FAST-NEXT: vzeroupper 1956 ; AVX2-FAST-NEXT: retq 1957 ; 1958 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1959 ; AVX512F: # %bb.0: 1960 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1961 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1962 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1963 ; AVX512F-NEXT: vzeroupper 1964 ; AVX512F-NEXT: retq 1965 ; 1966 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1967 ; AVX512BW: # %bb.0: 1968 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1969 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1970 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1971 ; AVX512BW-NEXT: vzeroupper 1972 ; AVX512BW-NEXT: retq 1973 ; 1974 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1975 ; AVX512DQ: # %bb.0: 1976 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1977 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1978 ; AVX512DQ-NEXT: vzeroupper 1979 ; AVX512DQ-NEXT: retq 1980 %1 = mul <8 x i64> %a0, %a1 1981 %2 = trunc <8 x i64> %1 to <8 x i16> 1982 ret <8 x i16> %2 1983 } 1984 1985 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1986 ; SSE-LABEL: trunc_mul_v8i32_v8i16: 1987 ; SSE: # %bb.0: 1988 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1989 ; SSE-NEXT: pmuludq %xmm2, %xmm0 1990 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1991 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1992 ; SSE-NEXT: pmuludq %xmm4, %xmm2 1993 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1994 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1995 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1996 ; SSE-NEXT: pmuludq %xmm3, %xmm1 1997 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1998 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1999 ; SSE-NEXT: pmuludq %xmm2, %xmm3 2000 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 2001 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2002 ; SSE-NEXT: pslld $16, %xmm1 2003 ; SSE-NEXT: psrad $16, %xmm1 2004 ; SSE-NEXT: pslld $16, %xmm0 2005 ; SSE-NEXT: psrad $16, %xmm0 2006 ; SSE-NEXT: packssdw %xmm1, %xmm0 2007 ; SSE-NEXT: retq 2008 ; 2009 ; AVX1-LABEL: trunc_mul_v8i32_v8i16: 2010 ; AVX1: # %bb.0: 2011 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 2012 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2013 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2014 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2015 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2016 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2017 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 2018 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2019 ; AVX1-NEXT: vzeroupper 2020 ; AVX1-NEXT: retq 2021 ; 2022 ; AVX2-LABEL: trunc_mul_v8i32_v8i16: 2023 ; AVX2: # %bb.0: 2024 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2025 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2026 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2027 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2028 ; AVX2-NEXT: vzeroupper 2029 ; AVX2-NEXT: retq 2030 ; 2031 ; AVX512-LABEL: trunc_mul_v8i32_v8i16: 2032 ; AVX512: # %bb.0: 2033 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2034 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2035 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2036 ; AVX512-NEXT: vzeroupper 2037 ; AVX512-NEXT: retq 2038 %1 = mul <8 x i32> %a0, %a1 2039 %2 = trunc <8 x i32> 
%1 to <8 x i16> 2040 ret <8 x i16> %2 2041 } 2042 2043 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 2044 ; SSE-LABEL: trunc_mul_v16i64_v16i8: 2045 ; SSE: # %bb.0: 2046 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 2047 ; SSE-NEXT: movdqa %xmm0, %xmm9 2048 ; SSE-NEXT: psrlq $32, %xmm9 2049 ; SSE-NEXT: pmuludq %xmm8, %xmm9 2050 ; SSE-NEXT: movdqa %xmm8, %xmm10 2051 ; SSE-NEXT: psrlq $32, %xmm10 2052 ; SSE-NEXT: pmuludq %xmm0, %xmm10 2053 ; SSE-NEXT: paddq %xmm9, %xmm10 2054 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 2055 ; SSE-NEXT: psllq $32, %xmm10 2056 ; SSE-NEXT: pmuludq %xmm8, %xmm0 2057 ; SSE-NEXT: paddq %xmm10, %xmm0 2058 ; SSE-NEXT: movdqa %xmm1, %xmm8 2059 ; SSE-NEXT: psrlq $32, %xmm8 2060 ; SSE-NEXT: pmuludq %xmm9, %xmm8 2061 ; SSE-NEXT: movdqa %xmm9, %xmm10 2062 ; SSE-NEXT: psrlq $32, %xmm10 2063 ; SSE-NEXT: pmuludq %xmm1, %xmm10 2064 ; SSE-NEXT: paddq %xmm8, %xmm10 2065 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 2066 ; SSE-NEXT: psllq $32, %xmm10 2067 ; SSE-NEXT: pmuludq %xmm9, %xmm1 2068 ; SSE-NEXT: paddq %xmm10, %xmm1 2069 ; SSE-NEXT: movdqa %xmm2, %xmm9 2070 ; SSE-NEXT: psrlq $32, %xmm9 2071 ; SSE-NEXT: pmuludq %xmm8, %xmm9 2072 ; SSE-NEXT: movdqa %xmm8, %xmm10 2073 ; SSE-NEXT: psrlq $32, %xmm10 2074 ; SSE-NEXT: pmuludq %xmm2, %xmm10 2075 ; SSE-NEXT: paddq %xmm9, %xmm10 2076 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 2077 ; SSE-NEXT: psllq $32, %xmm10 2078 ; SSE-NEXT: pmuludq %xmm8, %xmm2 2079 ; SSE-NEXT: paddq %xmm10, %xmm2 2080 ; SSE-NEXT: movdqa %xmm3, %xmm8 2081 ; SSE-NEXT: psrlq $32, %xmm8 2082 ; SSE-NEXT: pmuludq %xmm9, %xmm8 2083 ; SSE-NEXT: movdqa %xmm9, %xmm10 2084 ; SSE-NEXT: psrlq $32, %xmm10 2085 ; SSE-NEXT: pmuludq %xmm3, %xmm10 2086 ; SSE-NEXT: paddq %xmm8, %xmm10 2087 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 2088 ; SSE-NEXT: psllq $32, %xmm10 2089 ; SSE-NEXT: pmuludq %xmm9, %xmm3 2090 ; SSE-NEXT: paddq %xmm10, %xmm3 2091 ; SSE-NEXT: movdqa %xmm4, %xmm9 2092 ; SSE-NEXT: psrlq $32, %xmm9 2093 ; SSE-NEXT: pmuludq %xmm8, %xmm9 2094 ; SSE-NEXT: movdqa %xmm8, %xmm10 2095 ; SSE-NEXT: psrlq $32, %xmm10 2096 ; SSE-NEXT: pmuludq %xmm4, %xmm10 2097 ; SSE-NEXT: paddq %xmm9, %xmm10 2098 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 2099 ; SSE-NEXT: psllq $32, %xmm10 2100 ; SSE-NEXT: pmuludq %xmm8, %xmm4 2101 ; SSE-NEXT: paddq %xmm10, %xmm4 2102 ; SSE-NEXT: movdqa %xmm5, %xmm8 2103 ; SSE-NEXT: psrlq $32, %xmm8 2104 ; SSE-NEXT: pmuludq %xmm9, %xmm8 2105 ; SSE-NEXT: movdqa %xmm9, %xmm10 2106 ; SSE-NEXT: psrlq $32, %xmm10 2107 ; SSE-NEXT: pmuludq %xmm5, %xmm10 2108 ; SSE-NEXT: paddq %xmm8, %xmm10 2109 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 2110 ; SSE-NEXT: psllq $32, %xmm10 2111 ; SSE-NEXT: pmuludq %xmm9, %xmm5 2112 ; SSE-NEXT: paddq %xmm10, %xmm5 2113 ; SSE-NEXT: movdqa %xmm6, %xmm9 2114 ; SSE-NEXT: psrlq $32, %xmm9 2115 ; SSE-NEXT: pmuludq %xmm8, %xmm9 2116 ; SSE-NEXT: movdqa %xmm8, %xmm10 2117 ; SSE-NEXT: psrlq $32, %xmm10 2118 ; SSE-NEXT: pmuludq %xmm6, %xmm10 2119 ; SSE-NEXT: paddq %xmm9, %xmm10 2120 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 2121 ; SSE-NEXT: psllq $32, %xmm10 2122 ; SSE-NEXT: pmuludq %xmm8, %xmm6 2123 ; SSE-NEXT: paddq %xmm10, %xmm6 2124 ; SSE-NEXT: movdqa %xmm7, %xmm8 2125 ; SSE-NEXT: psrlq $32, %xmm8 2126 ; SSE-NEXT: pmuludq %xmm9, %xmm8 2127 ; SSE-NEXT: movdqa %xmm9, %xmm10 2128 ; SSE-NEXT: psrlq $32, %xmm10 2129 ; SSE-NEXT: pmuludq %xmm7, %xmm10 2130 ; SSE-NEXT: paddq %xmm8, %xmm10 2131 ; SSE-NEXT: pmuludq %xmm9, %xmm7 2132 ; SSE-NEXT: psllq $32, %xmm10 2133 ; SSE-NEXT: paddq %xmm10, %xmm7 2134 ; SSE-NEXT: movdqa 
{{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2135 ; SSE-NEXT: pand %xmm8, %xmm7 2136 ; SSE-NEXT: pand %xmm8, %xmm6 2137 ; SSE-NEXT: packuswb %xmm7, %xmm6 2138 ; SSE-NEXT: pand %xmm8, %xmm5 2139 ; SSE-NEXT: pand %xmm8, %xmm4 2140 ; SSE-NEXT: packuswb %xmm5, %xmm4 2141 ; SSE-NEXT: packuswb %xmm6, %xmm4 2142 ; SSE-NEXT: pand %xmm8, %xmm3 2143 ; SSE-NEXT: pand %xmm8, %xmm2 2144 ; SSE-NEXT: packuswb %xmm3, %xmm2 2145 ; SSE-NEXT: pand %xmm8, %xmm1 2146 ; SSE-NEXT: pand %xmm8, %xmm0 2147 ; SSE-NEXT: packuswb %xmm1, %xmm0 2148 ; SSE-NEXT: packuswb %xmm2, %xmm0 2149 ; SSE-NEXT: packuswb %xmm4, %xmm0 2150 ; SSE-NEXT: retq 2151 ; 2152 ; AVX1-LABEL: trunc_mul_v16i64_v16i8: 2153 ; AVX1: # %bb.0: 2154 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8 2155 ; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8 2156 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 2157 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9 2158 ; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 2159 ; AVX1-NEXT: vpsllq $32, %xmm8, %xmm8 2160 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm9 2161 ; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 2162 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9 2163 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2164 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 2165 ; AVX1-NEXT: vpmuludq %xmm9, %xmm4, %xmm10 2166 ; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm4 2167 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 2168 ; AVX1-NEXT: vpaddq %xmm10, %xmm4, %xmm4 2169 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2170 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm0 2171 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm9 2172 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 2173 ; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 2174 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0 2175 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 2176 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 2177 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2178 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 2179 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm10 2180 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 2181 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2182 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 2183 ; AVX1-NEXT: vpmuludq %xmm0, %xmm5, %xmm5 2184 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 2185 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4 2186 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 2187 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2188 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 2189 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1 2190 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm0 2191 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0 2192 ; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4 2193 ; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 2194 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 2195 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 2196 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm4 2197 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm5 2198 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 2199 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2200 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 2201 ; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm4 2202 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 2203 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6 2204 ; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 2205 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 2206 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 2207 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 2208 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm2 2209 ; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 2210 ; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4 2211 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 2212 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 2213 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 2214 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm4 2215 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 2216 ; AVX1-NEXT: vextractf128 $1, %ymm7, 
%xmm4 2217 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2218 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 2219 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6 2220 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 2221 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7 2222 ; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 2223 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 2224 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 2225 ; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3 2226 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2227 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2228 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2229 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2230 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2231 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2232 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2233 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2234 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2235 ; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2 2236 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2237 ; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2 2238 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2239 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 2240 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2241 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2242 ; AVX1-NEXT: vzeroupper 2243 ; AVX1-NEXT: retq 2244 ; 2245 ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: 2246 ; AVX2-SLOW: # %bb.0: 2247 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] 2248 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] 2249 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 2250 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 2251 ; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3 2252 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] 2253 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] 2254 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 2255 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2256 ; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 2257 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2258 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2259 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2260 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2261 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2262 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 2263 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] 2264 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] 2265 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 2266 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2267 ; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 2268 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] 2269 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] 2270 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2271 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2272 ; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 2273 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2274 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2275 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2276 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 2277 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2278 ; AVX2-SLOW-NEXT: vzeroupper 2279 ; AVX2-SLOW-NEXT: retq 2280 ; 2281 ; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: 2282 ; AVX2-FAST: # %bb.0: 2283 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] 2284 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 2285 ; 
AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 2286 ; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 2287 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 2288 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 2289 ; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 2290 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2291 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2292 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2293 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2294 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2295 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 2296 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 2297 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 2298 ; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 2299 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4 2300 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 2301 ; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 2302 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2303 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2304 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2305 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 2306 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2307 ; AVX2-FAST-NEXT: vzeroupper 2308 ; AVX2-FAST-NEXT: retq 2309 ; 2310 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 2311 ; AVX512F: # %bb.0: 2312 ; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 2313 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 2314 ; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2315 ; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 2316 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 2317 ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2318 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2319 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2320 ; AVX512F-NEXT: vzeroupper 2321 ; AVX512F-NEXT: retq 2322 ; 2323 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 2324 ; AVX512BW: # %bb.0: 2325 ; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 2326 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 2327 ; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2328 ; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 2329 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 2330 ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2331 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2332 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 2333 ; AVX512BW-NEXT: vzeroupper 2334 ; AVX512BW-NEXT: retq 2335 ; 2336 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 2337 ; AVX512DQ: # %bb.0: 2338 ; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 2339 ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 2340 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 2341 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 2342 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2343 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2344 ; AVX512DQ-NEXT: vzeroupper 2345 ; AVX512DQ-NEXT: retq 2346 %1 = mul <16 x i64> %a0, %a1 2347 %2 = trunc <16 x i64> %1 to <16 x i8> 2348 ret <16 x i8> %2 2349 } 2350 2351 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2352 ; SSE-LABEL: trunc_mul_v16i32_v16i8: 2353 ; SSE: # %bb.0: 2354 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 2355 ; SSE-NEXT: pmuludq %xmm4, %xmm0 2356 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2357 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2358 ; SSE-NEXT: pmuludq %xmm8, %xmm4 2359 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2360 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2361 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2362 ; SSE-NEXT: pmuludq %xmm5, %xmm1 2363 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,2,2,3] 2364 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 2365 ; SSE-NEXT: pmuludq %xmm4, %xmm5 2366 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2367 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2368 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2369 ; SSE-NEXT: pmuludq %xmm6, %xmm2 2370 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2371 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 2372 ; SSE-NEXT: pmuludq %xmm4, %xmm5 2373 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2374 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2375 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2376 ; SSE-NEXT: pmuludq %xmm7, %xmm3 2377 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2378 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 2379 ; SSE-NEXT: pmuludq %xmm4, %xmm5 2380 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 2381 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2382 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2383 ; SSE-NEXT: pand %xmm4, %xmm3 2384 ; SSE-NEXT: pand %xmm4, %xmm2 2385 ; SSE-NEXT: packuswb %xmm3, %xmm2 2386 ; SSE-NEXT: pand %xmm4, %xmm1 2387 ; SSE-NEXT: pand %xmm4, %xmm0 2388 ; SSE-NEXT: packuswb %xmm1, %xmm0 2389 ; SSE-NEXT: packuswb %xmm2, %xmm0 2390 ; SSE-NEXT: retq 2391 ; 2392 ; AVX1-LABEL: trunc_mul_v16i32_v16i8: 2393 ; AVX1: # %bb.0: 2394 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 2395 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2396 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2397 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 2398 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 2399 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2400 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2401 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 2402 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2403 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2404 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2405 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2406 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2407 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 2408 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2409 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2410 ; AVX1-NEXT: vzeroupper 2411 ; AVX1-NEXT: retq 2412 ; 2413 ; AVX2-LABEL: trunc_mul_v16i32_v16i8: 2414 ; AVX2: # %bb.0: 2415 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2416 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2417 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2418 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2419 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2420 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2421 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2422 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2423 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2424 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2425 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2426 ; AVX2-NEXT: vzeroupper 2427 ; AVX2-NEXT: retq 2428 ; 2429 ; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2430 ; AVX512: # %bb.0: 2431 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2432 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2433 ; AVX512-NEXT: vzeroupper 2434 ; AVX512-NEXT: retq 2435 %1 = mul <16 x i32> %a0, %a1 2436 %2 = trunc <16 x i32> %1 to <16 x i8> 2437 ret <16 x i8> %2 2438 } 2439 2440 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2441 ; SSE-LABEL: trunc_mul_v16i16_v16i8: 2442 ; SSE: # %bb.0: 2443 ; SSE-NEXT: pmullw %xmm2, %xmm0 2444 ; SSE-NEXT: 
pmullw %xmm3, %xmm1 2445 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2446 ; SSE-NEXT: pand %xmm2, %xmm1 2447 ; SSE-NEXT: pand %xmm2, %xmm0 2448 ; SSE-NEXT: packuswb %xmm1, %xmm0 2449 ; SSE-NEXT: retq 2450 ; 2451 ; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2452 ; AVX1: # %bb.0: 2453 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2454 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2455 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2456 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2457 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2458 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2459 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 2460 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2461 ; AVX1-NEXT: vzeroupper 2462 ; AVX1-NEXT: retq 2463 ; 2464 ; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2465 ; AVX2: # %bb.0: 2466 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2467 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2468 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2469 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2470 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2471 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2472 ; AVX2-NEXT: vzeroupper 2473 ; AVX2-NEXT: retq 2474 ; 2475 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8: 2476 ; AVX512F: # %bb.0: 2477 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2478 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 2479 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2480 ; AVX512F-NEXT: vzeroupper 2481 ; AVX512F-NEXT: retq 2482 ; 2483 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2484 ; AVX512BW: # %bb.0: 2485 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2486 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2487 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2488 ; AVX512BW-NEXT: vzeroupper 2489 ; AVX512BW-NEXT: retq 2490 ; 2491 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2492 ; AVX512DQ: # %bb.0: 2493 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2494 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2495 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2496 ; AVX512DQ-NEXT: vzeroupper 2497 ; AVX512DQ-NEXT: retq 2498 %1 = mul <16 x i16> %a0, %a1 2499 %2 = trunc <16 x i16> %1 to <16 x i8> 2500 ret <16 x i8> %2 2501 } 2502 2503 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2504 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2505 ; SSE: # %bb.0: 2506 ; SSE-NEXT: pxor %xmm3, %xmm3 2507 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2508 ; SSE-NEXT: pslld $16, %xmm2 2509 ; SSE-NEXT: psrad $16, %xmm2 2510 ; SSE-NEXT: pslld $16, %xmm1 2511 ; SSE-NEXT: psrad $16, %xmm1 2512 ; SSE-NEXT: packssdw %xmm2, %xmm1 2513 ; SSE-NEXT: pmullw %xmm1, %xmm0 2514 ; SSE-NEXT: retq 2515 ; 2516 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2517 ; AVX1: # %bb.0: 2518 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2519 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2520 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2521 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2522 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2523 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2524 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2525 ; AVX1-NEXT: vzeroupper 2526 ; AVX1-NEXT: retq 2527 ; 2528 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2529 ; AVX2: # %bb.0: 2530 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2531 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2532 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2533 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2534 ; AVX2-NEXT: vzeroupper 2535 ; AVX2-NEXT: retq 2536 ; 2537 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2538 ; AVX512: # %bb.0: 2539 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2540 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2541 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2542 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2543 ; AVX512-NEXT: vzeroupper 2544 ; AVX512-NEXT: retq 2545 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2546 %2 = zext <8 x i8> %1 to <8 x i32> 2547 %3 = mul <8 x i32> %2, %a1 2548 %4 = trunc <8 x i32> %3 to <8 x i16> 2549 ret <8 x i16> %4 2550 } 2551 2552 ; 2553 ; mul to constant 2554 ; 2555 2556 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2557 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2558 ; SSE: # %bb.0: 2559 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3] 2560 ; SSE-NEXT: movdqa %xmm1, %xmm3 2561 ; SSE-NEXT: pmuludq %xmm2, %xmm3 2562 ; SSE-NEXT: psrlq $32, %xmm1 2563 ; SSE-NEXT: pmuludq %xmm2, %xmm1 2564 ; SSE-NEXT: psllq $32, %xmm1 2565 ; SSE-NEXT: paddq %xmm3, %xmm1 2566 ; SSE-NEXT: movl $1, %eax 2567 ; SSE-NEXT: movq %rax, %xmm2 2568 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] 2569 ; SSE-NEXT: movdqa %xmm0, %xmm3 2570 ; SSE-NEXT: pmuludq %xmm2, %xmm3 2571 ; SSE-NEXT: psrlq $32, %xmm0 2572 ; SSE-NEXT: pmuludq %xmm2, %xmm0 2573 ; SSE-NEXT: psllq $32, %xmm0 2574 ; SSE-NEXT: paddq %xmm3, %xmm0 2575 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2576 ; SSE-NEXT: retq 2577 ; 2578 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2579 ; AVX1: # %bb.0: 2580 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2581 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2582 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2583 ; AVX1-NEXT: vzeroupper 2584 ; AVX1-NEXT: retq 2585 ; 2586 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2587 ; AVX2-SLOW: # %bb.0: 2588 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2589 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2590 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2591 ; AVX2-SLOW-NEXT: vzeroupper 2592 ; AVX2-SLOW-NEXT: retq 2593 ; 2594 ; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: 2595 ; AVX2-FAST: # %bb.0: 2596 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 2597 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2598 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2599 ; AVX2-FAST-NEXT: vzeroupper 2600 ; AVX2-FAST-NEXT: retq 2601 ; 2602 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2603 ; AVX512: # %bb.0: 2604 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2605 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2606 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2607 ; AVX512-NEXT: vzeroupper 2608 ; AVX512-NEXT: retq 2609 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2610 %2 = trunc <4 x i64> %1 to <4 x i32> 2611 ret <4 x i32> %2 2612 } 2613 2614 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2615 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2616 ; SSE: # %bb.0: 2617 ; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm8, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm9, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
; SSE-NEXT: movdqa %xmm1, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm8, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm9, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm8, %xmm2
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: paddq %xmm9, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
; SSE-NEXT: movdqa %xmm3, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm8, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: paddq %xmm9, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
; SSE-NEXT: movdqa %xmm4, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm8, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm9, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm8, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm9, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm8, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: paddq %xmm9, %xmm6
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
; SSE-NEXT: movdqa %xmm7, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm7
; SSE-NEXT: pmuludq %xmm8, %xmm7
; SSE-NEXT: psllq $32, %xmm7
; SSE-NEXT: paddq %xmm9, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 =
[255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2861 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 2862 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 2863 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2864 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 2865 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3 2866 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 2867 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2868 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 2869 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2 2870 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2871 ; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2 2872 ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3 2873 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 2874 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2875 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2876 ; AVX1-NEXT: vzeroupper 2877 ; AVX1-NEXT: retq 2878 ; 2879 ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: 2880 ; AVX2-SLOW: # %bb.0: 2881 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 2882 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2883 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 2884 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 2885 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 2886 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 2887 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2888 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2889 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2890 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2891 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2892 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2893 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 2894 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2895 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2896 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 2897 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2898 ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2899 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2900 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2901 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2902 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2903 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2904 ; AVX2-SLOW-NEXT: vzeroupper 2905 ; AVX2-SLOW-NEXT: retq 2906 ; 2907 ; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: 2908 ; AVX2-FAST: # %bb.0: 2909 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 2910 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 2911 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 2912 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 2913 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 2914 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 2915 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2916 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 2917 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 2918 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2919 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 2920 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 2921 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2922 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 2923 ; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 2924 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2925 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 2926 ; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2927 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2928 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2929 ; AVX2-FAST-NEXT: vzeroupper 2930 ; AVX2-FAST-NEXT: retq 2931 ; 2932 ; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: 2933 ; AVX512: # %bb.0: 2934 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2935 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 2936 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 2937 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 2938 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 2939 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2940 ; AVX512-NEXT: vzeroupper 2941 ; AVX512-NEXT: retq 2942 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2943 %2 = trunc <16 x i64> %1 to <16 x i8> 2944 ret <16 x i8> %2 2945 } 2946 2947 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2948 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2949 ; SSE: # %bb.0: 2950 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] 2951 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2952 ; SSE-NEXT: pmuludq %xmm4, %xmm0 2953 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2954 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2955 ; SSE-NEXT: pmuludq %xmm5, %xmm4 2956 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2957 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2958 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] 2959 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 2960 ; SSE-NEXT: pmuludq %xmm4, %xmm1 2961 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2962 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2963 ; SSE-NEXT: pmuludq %xmm5, %xmm4 2964 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2965 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2966 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] 2967 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 2968 ; SSE-NEXT: pmuludq %xmm4, %xmm2 2969 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2970 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2971 ; SSE-NEXT: pmuludq %xmm5, %xmm4 2972 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2973 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2974 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] 2975 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2976 ; SSE-NEXT: pmuludq %xmm4, %xmm3 2977 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2978 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2979 ; SSE-NEXT: pmuludq %xmm5, %xmm4 2980 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2981 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2982 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2983 ; SSE-NEXT: pand %xmm4, %xmm3 2984 ; SSE-NEXT: pand %xmm4, %xmm2 2985 ; SSE-NEXT: packuswb %xmm3, %xmm2 2986 ; SSE-NEXT: pand %xmm4, %xmm1 2987 ; SSE-NEXT: pand %xmm4, %xmm0 2988 ; SSE-NEXT: packuswb %xmm1, %xmm0 2989 ; SSE-NEXT: packuswb %xmm2, %xmm0 2990 ; SSE-NEXT: retq 2991 ; 2992 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2993 ; AVX1: # %bb.0: 2994 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 2995 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2996 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 2997 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 2998 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2999 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 3000 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3001 ; 
AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3002 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 3003 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 3004 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3005 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3006 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 3007 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3008 ; AVX1-NEXT: vzeroupper 3009 ; AVX1-NEXT: retq 3010 ; 3011 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 3012 ; AVX2: # %bb.0: 3013 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3014 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3015 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3016 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 3017 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3018 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3019 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3020 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3021 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 3022 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3023 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3024 ; AVX2-NEXT: vzeroupper 3025 ; AVX2-NEXT: retq 3026 ; 3027 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 3028 ; AVX512: # %bb.0: 3029 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 3030 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3031 ; AVX512-NEXT: vzeroupper 3032 ; AVX512-NEXT: retq 3033 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3034 %2 = trunc <16 x i32> %1 to <16 x i8> 3035 ret <16 x i8> %2 3036 } 3037 3038 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3039 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 3040 ; SSE: # %bb.0: 3041 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 3042 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 3043 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 3044 ; SSE-NEXT: pand %xmm2, %xmm1 3045 ; SSE-NEXT: pand %xmm2, %xmm0 3046 ; SSE-NEXT: packuswb %xmm1, %xmm0 3047 ; SSE-NEXT: retq 3048 ; 3049 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 3050 ; AVX1: # %bb.0: 3051 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 3052 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3053 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 3054 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3055 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3056 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3057 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3058 ; AVX1-NEXT: vzeroupper 3059 ; AVX1-NEXT: retq 3060 ; 3061 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 3062 ; AVX2: # %bb.0: 3063 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 3064 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3065 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3066 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3067 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3068 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3069 ; AVX2-NEXT: vzeroupper 3070 ; AVX2-NEXT: retq 3071 ; 3072 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 3073 ; AVX512F: # %bb.0: 3074 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 3075 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3076 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3077 ; AVX512F-NEXT: vzeroupper 3078 ; AVX512F-NEXT: retq 3079 ; 3080 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 3081 ; AVX512BW: # %bb.0: 3082 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 3083 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3084 ; AVX512BW-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 3085 ; AVX512BW-NEXT: vzeroupper 3086 ; AVX512BW-NEXT: retq 3087 ; 3088 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 3089 ; AVX512DQ: # %bb.0: 3090 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 3091 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3092 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3093 ; AVX512DQ-NEXT: vzeroupper 3094 ; AVX512DQ-NEXT: retq 3095 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3096 %2 = trunc <16 x i16> %1 to <16 x i8> 3097 ret <16 x i8> %2 3098 } 3099 3100 ; 3101 ; and 3102 ; 3103 3104 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3105 ; SSE-LABEL: trunc_and_v4i64_v4i32: 3106 ; SSE: # %bb.0: 3107 ; SSE-NEXT: andps %xmm3, %xmm1 3108 ; SSE-NEXT: andps %xmm2, %xmm0 3109 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3110 ; SSE-NEXT: retq 3111 ; 3112 ; AVX1-LABEL: trunc_and_v4i64_v4i32: 3113 ; AVX1: # %bb.0: 3114 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3115 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3116 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3117 ; AVX1-NEXT: vzeroupper 3118 ; AVX1-NEXT: retq 3119 ; 3120 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 3121 ; AVX2-SLOW: # %bb.0: 3122 ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 3123 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3124 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 3125 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3126 ; AVX2-SLOW-NEXT: vzeroupper 3127 ; AVX2-SLOW-NEXT: retq 3128 ; 3129 ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: 3130 ; AVX2-FAST: # %bb.0: 3131 ; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 3132 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 3133 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3134 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3135 ; AVX2-FAST-NEXT: vzeroupper 3136 ; AVX2-FAST-NEXT: retq 3137 ; 3138 ; AVX512-LABEL: trunc_and_v4i64_v4i32: 3139 ; AVX512: # %bb.0: 3140 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 3141 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3142 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3143 ; AVX512-NEXT: vzeroupper 3144 ; AVX512-NEXT: retq 3145 %1 = and <4 x i64> %a0, %a1 3146 %2 = trunc <4 x i64> %1 to <4 x i32> 3147 ret <4 x i32> %2 3148 } 3149 3150 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3151 ; SSE-LABEL: trunc_and_v8i64_v8i16: 3152 ; SSE: # %bb.0: 3153 ; SSE-NEXT: pand %xmm6, %xmm2 3154 ; SSE-NEXT: pand %xmm7, %xmm3 3155 ; SSE-NEXT: pand %xmm4, %xmm0 3156 ; SSE-NEXT: pand %xmm5, %xmm1 3157 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3158 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3159 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3160 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3161 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3162 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3163 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3164 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3165 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3166 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3167 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3168 ; SSE-NEXT: retq 3169 ; 3170 ; AVX1-LABEL: trunc_and_v8i64_v8i16: 3171 ; AVX1: # %bb.0: 3172 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3173 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 3174 ; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm2 3175 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 3176 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3177 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 3178 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3179 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3180 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3181 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 3182 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3183 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3184 ; AVX1-NEXT: vzeroupper 3185 ; AVX1-NEXT: retq 3186 ; 3187 ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: 3188 ; AVX2-SLOW: # %bb.0: 3189 ; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 3190 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 3191 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3192 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3193 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3194 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3195 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3196 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3197 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3198 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3199 ; AVX2-SLOW-NEXT: vzeroupper 3200 ; AVX2-SLOW-NEXT: retq 3201 ; 3202 ; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: 3203 ; AVX2-FAST: # %bb.0: 3204 ; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 3205 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 3206 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 3207 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 3208 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 3209 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3210 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3211 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3212 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3213 ; AVX2-FAST-NEXT: vzeroupper 3214 ; AVX2-FAST-NEXT: retq 3215 ; 3216 ; AVX512-LABEL: trunc_and_v8i64_v8i16: 3217 ; AVX512: # %bb.0: 3218 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 3219 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3220 ; AVX512-NEXT: vzeroupper 3221 ; AVX512-NEXT: retq 3222 %1 = and <8 x i64> %a0, %a1 3223 %2 = trunc <8 x i64> %1 to <8 x i16> 3224 ret <8 x i16> %2 3225 } 3226 3227 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3228 ; SSE-LABEL: trunc_and_v8i32_v8i16: 3229 ; SSE: # %bb.0: 3230 ; SSE-NEXT: pand %xmm2, %xmm0 3231 ; SSE-NEXT: pand %xmm3, %xmm1 3232 ; SSE-NEXT: pslld $16, %xmm1 3233 ; SSE-NEXT: psrad $16, %xmm1 3234 ; SSE-NEXT: pslld $16, %xmm0 3235 ; SSE-NEXT: psrad $16, %xmm0 3236 ; SSE-NEXT: packssdw %xmm1, %xmm0 3237 ; SSE-NEXT: retq 3238 ; 3239 ; AVX1-LABEL: trunc_and_v8i32_v8i16: 3240 ; AVX1: # %bb.0: 3241 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3242 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3243 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3244 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3245 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3246 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3247 ; AVX1-NEXT: vzeroupper 3248 ; AVX1-NEXT: retq 3249 ; 3250 ; AVX2-LABEL: trunc_and_v8i32_v8i16: 3251 ; AVX2: # %bb.0: 3252 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3253 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3254 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3255 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3256 ; AVX2-NEXT: vzeroupper 3257 ; AVX2-NEXT: retq 3258 ; 3259 ; AVX512-LABEL: trunc_and_v8i32_v8i16: 3260 ; AVX512: # %bb.0: 3261 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 3262 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3263 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3264 ; AVX512-NEXT: vzeroupper 3265 ; AVX512-NEXT: retq 3266 %1 = and <8 x i32> %a0, %a1 3267 %2 = trunc <8 x i32> %1 to <8 x i16> 3268 ret <8 x i16> %2 3269 } 3270 3271 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3272 ; SSE-LABEL: trunc_and_v16i64_v16i8: 3273 ; SSE: # %bb.0: 3274 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 3275 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 3276 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 3277 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 3278 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 3279 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 3280 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 3281 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 3282 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3283 ; SSE-NEXT: pand %xmm8, %xmm7 3284 ; SSE-NEXT: pand %xmm8, %xmm6 3285 ; SSE-NEXT: packuswb %xmm7, %xmm6 3286 ; SSE-NEXT: pand %xmm8, %xmm5 3287 ; SSE-NEXT: pand %xmm8, %xmm4 3288 ; SSE-NEXT: packuswb %xmm5, %xmm4 3289 ; SSE-NEXT: packuswb %xmm6, %xmm4 3290 ; SSE-NEXT: pand %xmm8, %xmm3 3291 ; SSE-NEXT: pand %xmm8, %xmm2 3292 ; SSE-NEXT: packuswb %xmm3, %xmm2 3293 ; SSE-NEXT: pand %xmm8, %xmm1 3294 ; SSE-NEXT: pand %xmm8, %xmm0 3295 ; SSE-NEXT: packuswb %xmm1, %xmm0 3296 ; SSE-NEXT: packuswb %xmm2, %xmm0 3297 ; SSE-NEXT: packuswb %xmm4, %xmm0 3298 ; SSE-NEXT: retq 3299 ; 3300 ; AVX1-LABEL: trunc_and_v16i64_v16i8: 3301 ; AVX1: # %bb.0: 3302 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3303 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 3304 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 3305 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 3306 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 3307 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3308 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3309 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3310 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 3311 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 3312 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 3313 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 3314 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 3315 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3316 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3317 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3318 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 3319 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3320 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3321 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 3322 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 3323 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3324 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3325 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3326 ; AVX1-NEXT: vzeroupper 3327 ; AVX1-NEXT: retq 3328 ; 3329 ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: 3330 ; AVX2-SLOW: # %bb.0: 3331 ; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1 3332 ; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0 3333 ; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3 3334 ; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2 3335 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 3336 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3337 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm3[0,2,2,3,4,6,6,7] 3338 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 3339 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3340 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3341 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3342 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3343 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3344 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 3345 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3346 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3347 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3348 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3349 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3350 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3351 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3352 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 3353 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3354 ; AVX2-SLOW-NEXT: vzeroupper 3355 ; AVX2-SLOW-NEXT: retq 3356 ; 3357 ; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: 3358 ; AVX2-FAST: # %bb.0: 3359 ; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 3360 ; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 3361 ; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 3362 ; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 3363 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3364 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 3365 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 3366 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3367 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3368 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3369 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3370 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3371 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 3372 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 3373 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 3374 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3375 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3376 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3377 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 3378 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3379 ; AVX2-FAST-NEXT: vzeroupper 3380 ; AVX2-FAST-NEXT: retq 3381 ; 3382 ; AVX512-LABEL: trunc_and_v16i64_v16i8: 3383 ; AVX512: # %bb.0: 3384 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 3385 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 3386 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3387 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 3388 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3389 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3390 ; AVX512-NEXT: vzeroupper 3391 ; AVX512-NEXT: retq 3392 %1 = and <16 x i64> %a0, %a1 3393 %2 = trunc <16 x i64> %1 to <16 x i8> 3394 ret <16 x i8> %2 3395 } 3396 3397 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3398 ; SSE-LABEL: trunc_and_v16i32_v16i8: 3399 ; SSE: # %bb.0: 3400 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3401 ; SSE-NEXT: pand %xmm8, %xmm7 3402 ; SSE-NEXT: pand %xmm3, %xmm7 3403 ; SSE-NEXT: pand %xmm8, %xmm6 3404 ; SSE-NEXT: pand %xmm2, %xmm6 3405 ; SSE-NEXT: packuswb %xmm7, %xmm6 3406 ; SSE-NEXT: pand %xmm8, %xmm5 3407 ; SSE-NEXT: pand %xmm1, %xmm5 3408 ; SSE-NEXT: pand %xmm8, %xmm4 3409 ; SSE-NEXT: pand %xmm4, %xmm0 3410 ; SSE-NEXT: 
packuswb %xmm5, %xmm0 3411 ; SSE-NEXT: packuswb %xmm6, %xmm0 3412 ; SSE-NEXT: retq 3413 ; 3414 ; AVX1-LABEL: trunc_and_v16i32_v16i8: 3415 ; AVX1: # %bb.0: 3416 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3417 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 3418 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3419 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3420 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3421 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 3422 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3423 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3424 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 3425 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 3426 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3427 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3428 ; AVX1-NEXT: vzeroupper 3429 ; AVX1-NEXT: retq 3430 ; 3431 ; AVX2-LABEL: trunc_and_v16i32_v16i8: 3432 ; AVX2: # %bb.0: 3433 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3434 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3435 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3436 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3437 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3438 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3439 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3440 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3441 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3442 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3443 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3444 ; AVX2-NEXT: vzeroupper 3445 ; AVX2-NEXT: retq 3446 ; 3447 ; AVX512-LABEL: trunc_and_v16i32_v16i8: 3448 ; AVX512: # %bb.0: 3449 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 3450 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3451 ; AVX512-NEXT: vzeroupper 3452 ; AVX512-NEXT: retq 3453 %1 = and <16 x i32> %a0, %a1 3454 %2 = trunc <16 x i32> %1 to <16 x i8> 3455 ret <16 x i8> %2 3456 } 3457 3458 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3459 ; SSE-LABEL: trunc_and_v16i16_v16i8: 3460 ; SSE: # %bb.0: 3461 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 3462 ; SSE-NEXT: pand %xmm4, %xmm3 3463 ; SSE-NEXT: pand %xmm1, %xmm3 3464 ; SSE-NEXT: pand %xmm4, %xmm2 3465 ; SSE-NEXT: pand %xmm2, %xmm0 3466 ; SSE-NEXT: packuswb %xmm3, %xmm0 3467 ; SSE-NEXT: retq 3468 ; 3469 ; AVX1-LABEL: trunc_and_v16i16_v16i8: 3470 ; AVX1: # %bb.0: 3471 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 3472 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3473 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3474 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3475 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3476 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3477 ; AVX1-NEXT: vzeroupper 3478 ; AVX1-NEXT: retq 3479 ; 3480 ; AVX2-LABEL: trunc_and_v16i16_v16i8: 3481 ; AVX2: # %bb.0: 3482 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 3483 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3484 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3485 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3486 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3487 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3488 ; AVX2-NEXT: vzeroupper 3489 ; AVX2-NEXT: retq 3490 ; 3491 ; AVX512F-LABEL: trunc_and_v16i16_v16i8: 3492 ; AVX512F: # %bb.0: 3493 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 3494 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3495 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3496 ; AVX512F-NEXT: vzeroupper 3497 ; AVX512F-NEXT: retq 3498 ; 3499 ; AVX512BW-LABEL: 
trunc_and_v16i16_v16i8: 3500 ; AVX512BW: # %bb.0: 3501 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 3502 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3503 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3504 ; AVX512BW-NEXT: vzeroupper 3505 ; AVX512BW-NEXT: retq 3506 ; 3507 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: 3508 ; AVX512DQ: # %bb.0: 3509 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 3510 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3511 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3512 ; AVX512DQ-NEXT: vzeroupper 3513 ; AVX512DQ-NEXT: retq 3514 %1 = and <16 x i16> %a0, %a1 3515 %2 = trunc <16 x i16> %1 to <16 x i8> 3516 ret <16 x i8> %2 3517 } 3518 3519 ; 3520 ; and to constant 3521 ; 3522 3523 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3524 ; SSE-LABEL: trunc_and_const_v4i64_v4i32: 3525 ; SSE: # %bb.0: 3526 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3527 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 3528 ; SSE-NEXT: retq 3529 ; 3530 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32: 3531 ; AVX1: # %bb.0: 3532 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3533 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3534 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3535 ; AVX1-NEXT: vzeroupper 3536 ; AVX1-NEXT: retq 3537 ; 3538 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: 3539 ; AVX2-SLOW: # %bb.0: 3540 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3541 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 3542 ; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3543 ; AVX2-SLOW-NEXT: vzeroupper 3544 ; AVX2-SLOW-NEXT: retq 3545 ; 3546 ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: 3547 ; AVX2-FAST: # %bb.0: 3548 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 3549 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3550 ; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 3551 ; AVX2-FAST-NEXT: vzeroupper 3552 ; AVX2-FAST-NEXT: retq 3553 ; 3554 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: 3555 ; AVX512: # %bb.0: 3556 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3557 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3558 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3559 ; AVX512-NEXT: vzeroupper 3560 ; AVX512-NEXT: retq 3561 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3562 %2 = trunc <4 x i64> %1 to <4 x i32> 3563 ret <4 x i32> %2 3564 } 3565 3566 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3567 ; SSE-LABEL: trunc_and_const_v8i64_v8i16: 3568 ; SSE: # %bb.0: 3569 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3570 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3571 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3572 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3573 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3574 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3575 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3577 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3578 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3579 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3580 ; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 3581 ; SSE-NEXT: retq 3582 ; 3583 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: 3584 ; AVX1: # %bb.0: 3585 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3586 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 3587 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3588 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 3589 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3590 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3591 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3592 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 3593 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3594 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3595 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3596 ; AVX1-NEXT: vzeroupper 3597 ; AVX1-NEXT: retq 3598 ; 3599 ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: 3600 ; AVX2-SLOW: # %bb.0: 3601 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3602 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3603 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3604 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3605 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3606 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3607 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3608 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3609 ; AVX2-SLOW-NEXT: vzeroupper 3610 ; AVX2-SLOW-NEXT: retq 3611 ; 3612 ; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: 3613 ; AVX2-FAST: # %bb.0: 3614 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 3615 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 3616 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 3617 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3618 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3619 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3620 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3621 ; AVX2-FAST-NEXT: vzeroupper 3622 ; AVX2-FAST-NEXT: retq 3623 ; 3624 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16: 3625 ; AVX512: # %bb.0: 3626 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3627 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3628 ; AVX512-NEXT: vzeroupper 3629 ; AVX512-NEXT: retq 3630 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3631 %2 = trunc <8 x i64> %1 to <8 x i16> 3632 ret <8 x i16> %2 3633 } 3634 3635 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3636 ; SSE-LABEL: trunc_and_const_v8i32_v8i16: 3637 ; SSE: # %bb.0: 3638 ; SSE-NEXT: pslld $16, %xmm1 3639 ; SSE-NEXT: psrad $16, %xmm1 3640 ; SSE-NEXT: pslld $16, %xmm0 3641 ; SSE-NEXT: psrad $16, %xmm0 3642 ; SSE-NEXT: packssdw %xmm1, %xmm0 3643 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3644 ; SSE-NEXT: retq 3645 ; 3646 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16: 3647 ; AVX1: # %bb.0: 3648 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3649 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 3650 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3651 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3652 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3653 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3654 ; AVX1-NEXT: vzeroupper 3655 ; AVX1-NEXT: retq 3656 ; 3657 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16: 3658 ; AVX2: # %bb.0: 3659 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3660 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3661 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3662 ; AVX2-NEXT: vzeroupper 3663 ; AVX2-NEXT: retq 3664 ; 3665 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16: 3666 ; AVX512: # %bb.0: 3667 
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3668 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3669 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3670 ; AVX512-NEXT: vzeroupper 3671 ; AVX512-NEXT: retq 3672 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3673 %2 = trunc <8 x i32> %1 to <8 x i16> 3674 ret <8 x i16> %2 3675 } 3676 3677 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3678 ; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3679 ; SSE: # %bb.0: 3680 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3681 ; SSE-NEXT: pand %xmm8, %xmm7 3682 ; SSE-NEXT: pand %xmm8, %xmm6 3683 ; SSE-NEXT: packuswb %xmm7, %xmm6 3684 ; SSE-NEXT: pand %xmm8, %xmm5 3685 ; SSE-NEXT: pand %xmm8, %xmm4 3686 ; SSE-NEXT: packuswb %xmm5, %xmm4 3687 ; SSE-NEXT: packuswb %xmm6, %xmm4 3688 ; SSE-NEXT: pand %xmm8, %xmm3 3689 ; SSE-NEXT: pand %xmm8, %xmm2 3690 ; SSE-NEXT: packuswb %xmm3, %xmm2 3691 ; SSE-NEXT: pand %xmm8, %xmm1 3692 ; SSE-NEXT: pand %xmm8, %xmm0 3693 ; SSE-NEXT: packuswb %xmm1, %xmm0 3694 ; SSE-NEXT: packuswb %xmm2, %xmm0 3695 ; SSE-NEXT: packuswb %xmm4, %xmm0 3696 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3697 ; SSE-NEXT: retq 3698 ; 3699 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3700 ; AVX1: # %bb.0: 3701 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 3702 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3703 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 3704 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 3705 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 3706 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 3707 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 3708 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 3709 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 3710 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3711 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3712 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 3713 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 3714 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3715 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3716 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 3717 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 3718 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3719 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3720 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3721 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3722 ; AVX1-NEXT: vzeroupper 3723 ; AVX1-NEXT: retq 3724 ; 3725 ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: 3726 ; AVX2-SLOW: # %bb.0: 3727 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 3728 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3729 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 3730 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 3731 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3732 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3733 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3734 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3735 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3736 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 3737 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3738 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3739 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3740 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3741 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3742 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3743 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,2,2,3] 3744 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 3745 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3746 ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3747 ; AVX2-SLOW-NEXT: vzeroupper 3748 ; AVX2-SLOW-NEXT: retq 3749 ; 3750 ; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: 3751 ; AVX2-FAST: # %bb.0: 3752 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 3753 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 3754 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 3755 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 3756 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3757 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 3758 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 3759 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3760 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 3761 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 3762 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 3763 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3764 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 3765 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3766 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 3767 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3768 ; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3769 ; AVX2-FAST-NEXT: vzeroupper 3770 ; AVX2-FAST-NEXT: retq 3771 ; 3772 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3773 ; AVX512: # %bb.0: 3774 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3775 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 3776 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 3777 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3778 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3779 ; AVX512-NEXT: vzeroupper 3780 ; AVX512-NEXT: retq 3781 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3782 %2 = trunc <16 x i64> %1 to <16 x i8> 3783 ret <16 x i8> %2 3784 } 3785 3786 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3787 ; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3788 ; SSE: # %bb.0: 3789 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3790 ; SSE-NEXT: pand %xmm4, %xmm3 3791 ; SSE-NEXT: pand %xmm4, %xmm2 3792 ; SSE-NEXT: packuswb %xmm3, %xmm2 3793 ; SSE-NEXT: pand %xmm4, %xmm1 3794 ; SSE-NEXT: pand %xmm4, %xmm0 3795 ; SSE-NEXT: packuswb %xmm1, %xmm0 3796 ; SSE-NEXT: packuswb %xmm2, %xmm0 3797 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3798 ; SSE-NEXT: retq 3799 ; 3800 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3801 ; AVX1: # %bb.0: 3802 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3803 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3804 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 3805 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 3806 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3807 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3808 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 3809 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 3810 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3811 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3812 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3813 ; AVX1-NEXT: vzeroupper 3814 ; AVX1-NEXT: retq 3815 ; 3816 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3817 ; AVX2: # %bb.0: 3818 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3819 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3820 ; AVX2-NEXT: 
vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3821 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3822 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3823 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3824 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3825 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3826 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3827 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3828 ; AVX2-NEXT: vzeroupper 3829 ; AVX2-NEXT: retq 3830 ; 3831 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3832 ; AVX512: # %bb.0: 3833 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3834 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3835 ; AVX512-NEXT: vzeroupper 3836 ; AVX512-NEXT: retq 3837 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3838 %2 = trunc <16 x i32> %1 to <16 x i8> 3839 ret <16 x i8> %2 3840 } 3841 3842 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3843 ; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3844 ; SSE: # %bb.0: 3845 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 3846 ; SSE-NEXT: pand %xmm2, %xmm1 3847 ; SSE-NEXT: pand %xmm2, %xmm0 3848 ; SSE-NEXT: packuswb %xmm1, %xmm0 3849 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 3850 ; SSE-NEXT: retq 3851 ; 3852 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3853 ; AVX1: # %bb.0: 3854 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3855 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3856 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3857 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3858 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3859 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3860 ; AVX1-NEXT: vzeroupper 3861 ; AVX1-NEXT: retq 3862 ; 3863 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3864 ; AVX2: # %bb.0: 3865 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3866 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 3867 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3868 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3869 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3870 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3871 ; AVX2-NEXT: vzeroupper 3872 ; AVX2-NEXT: retq 3873 ; 3874 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3875 ; AVX512F: # %bb.0: 3876 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 3877 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3878 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3879 ; AVX512F-NEXT: vzeroupper 3880 ; AVX512F-NEXT: retq 3881 ; 3882 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3883 ; AVX512BW: # %bb.0: 3884 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3885 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3886 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3887 ; AVX512BW-NEXT: vzeroupper 3888 ; AVX512BW-NEXT: retq 3889 ; 3890 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: 3891 ; AVX512DQ: # %bb.0: 3892 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3893 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3894 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 3895 ; AVX512DQ-NEXT: vzeroupper 3896 ; AVX512DQ-NEXT: retq 3897 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3898 %2 = trunc <16 x i16> %1 to <16 x i8> 3899 ret <16 x i8> %2 3900 } 3901 3902 ; 3903 ; xor 3904 ; 3905 3906 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3907 ; SSE-LABEL: trunc_xor_v4i64_v4i32: 3908 ; SSE: # %bb.0: 3909 ; SSE-NEXT: 
xorps %xmm3, %xmm1 3910 ; SSE-NEXT: xorps %xmm2, %xmm0 3911 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3912 ; SSE-NEXT: retq 3913 ; 3914 ; AVX1-LABEL: trunc_xor_v4i64_v4i32: 3915 ; AVX1: # %bb.0: 3916 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3917 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3918 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3919 ; AVX1-NEXT: vzeroupper 3920 ; AVX1-NEXT: retq 3921 ; 3922 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: 3923 ; AVX2-SLOW: # %bb.0: 3924 ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 3925 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3926 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 3927 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3928 ; AVX2-SLOW-NEXT: vzeroupper 3929 ; AVX2-SLOW-NEXT: retq 3930 ; 3931 ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: 3932 ; AVX2-FAST: # %bb.0: 3933 ; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 3934 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 3935 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 3936 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3937 ; AVX2-FAST-NEXT: vzeroupper 3938 ; AVX2-FAST-NEXT: retq 3939 ; 3940 ; AVX512-LABEL: trunc_xor_v4i64_v4i32: 3941 ; AVX512: # %bb.0: 3942 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3943 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3944 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3945 ; AVX512-NEXT: vzeroupper 3946 ; AVX512-NEXT: retq 3947 %1 = xor <4 x i64> %a0, %a1 3948 %2 = trunc <4 x i64> %1 to <4 x i32> 3949 ret <4 x i32> %2 3950 } 3951 3952 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3953 ; SSE-LABEL: trunc_xor_v8i64_v8i16: 3954 ; SSE: # %bb.0: 3955 ; SSE-NEXT: pxor %xmm6, %xmm2 3956 ; SSE-NEXT: pxor %xmm7, %xmm3 3957 ; SSE-NEXT: pxor %xmm4, %xmm0 3958 ; SSE-NEXT: pxor %xmm5, %xmm1 3959 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3960 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3961 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3962 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3963 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3964 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3965 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3966 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3967 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3968 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3969 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3970 ; SSE-NEXT: retq 3971 ; 3972 ; AVX1-LABEL: trunc_xor_v8i64_v8i16: 3973 ; AVX1: # %bb.0: 3974 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3975 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3976 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3977 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 3978 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3979 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 3980 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 3981 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3982 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 3983 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 3984 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3985 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3986 ; AVX1-NEXT: vzeroupper 3987 ; AVX1-NEXT: retq 3988 ; 3989 ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: 3990 ; AVX2-SLOW: # %bb.0: 3991 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1 3992 ; AVX2-SLOW-NEXT: vpxor %ymm2, 
%ymm0, %ymm0 3993 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 3994 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3995 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 3996 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 3997 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 3998 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 3999 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4000 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4001 ; AVX2-SLOW-NEXT: vzeroupper 4002 ; AVX2-SLOW-NEXT: retq 4003 ; 4004 ; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: 4005 ; AVX2-FAST: # %bb.0: 4006 ; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 4007 ; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 4008 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 4009 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 4010 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 4011 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4012 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4013 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4014 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4015 ; AVX2-FAST-NEXT: vzeroupper 4016 ; AVX2-FAST-NEXT: retq 4017 ; 4018 ; AVX512-LABEL: trunc_xor_v8i64_v8i16: 4019 ; AVX512: # %bb.0: 4020 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 4021 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4022 ; AVX512-NEXT: vzeroupper 4023 ; AVX512-NEXT: retq 4024 %1 = xor <8 x i64> %a0, %a1 4025 %2 = trunc <8 x i64> %1 to <8 x i16> 4026 ret <8 x i16> %2 4027 } 4028 4029 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 4030 ; SSE-LABEL: trunc_xor_v8i32_v8i16: 4031 ; SSE: # %bb.0: 4032 ; SSE-NEXT: pxor %xmm2, %xmm0 4033 ; SSE-NEXT: pxor %xmm3, %xmm1 4034 ; SSE-NEXT: pslld $16, %xmm1 4035 ; SSE-NEXT: psrad $16, %xmm1 4036 ; SSE-NEXT: pslld $16, %xmm0 4037 ; SSE-NEXT: psrad $16, %xmm0 4038 ; SSE-NEXT: packssdw %xmm1, %xmm0 4039 ; SSE-NEXT: retq 4040 ; 4041 ; AVX1-LABEL: trunc_xor_v8i32_v8i16: 4042 ; AVX1: # %bb.0: 4043 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 4044 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4045 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4046 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4047 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4048 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4049 ; AVX1-NEXT: vzeroupper 4050 ; AVX1-NEXT: retq 4051 ; 4052 ; AVX2-LABEL: trunc_xor_v8i32_v8i16: 4053 ; AVX2: # %bb.0: 4054 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 4055 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4056 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4057 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4058 ; AVX2-NEXT: vzeroupper 4059 ; AVX2-NEXT: retq 4060 ; 4061 ; AVX512-LABEL: trunc_xor_v8i32_v8i16: 4062 ; AVX512: # %bb.0: 4063 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 4064 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4065 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4066 ; AVX512-NEXT: vzeroupper 4067 ; AVX512-NEXT: retq 4068 %1 = xor <8 x i32> %a0, %a1 4069 %2 = trunc <8 x i32> %1 to <8 x i16> 4070 ret <8 x i16> %2 4071 } 4072 4073 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4074 ; SSE-LABEL: trunc_xor_v16i64_v16i8: 4075 ; SSE: # %bb.0: 
4076 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 4077 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 4078 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 4079 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 4080 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 4081 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 4082 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 4083 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 4084 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4085 ; SSE-NEXT: pand %xmm8, %xmm7 4086 ; SSE-NEXT: pand %xmm8, %xmm6 4087 ; SSE-NEXT: packuswb %xmm7, %xmm6 4088 ; SSE-NEXT: pand %xmm8, %xmm5 4089 ; SSE-NEXT: pand %xmm8, %xmm4 4090 ; SSE-NEXT: packuswb %xmm5, %xmm4 4091 ; SSE-NEXT: packuswb %xmm6, %xmm4 4092 ; SSE-NEXT: pand %xmm8, %xmm3 4093 ; SSE-NEXT: pand %xmm8, %xmm2 4094 ; SSE-NEXT: packuswb %xmm3, %xmm2 4095 ; SSE-NEXT: pand %xmm8, %xmm1 4096 ; SSE-NEXT: pand %xmm8, %xmm0 4097 ; SSE-NEXT: packuswb %xmm1, %xmm0 4098 ; SSE-NEXT: packuswb %xmm2, %xmm0 4099 ; SSE-NEXT: packuswb %xmm4, %xmm0 4100 ; SSE-NEXT: retq 4101 ; 4102 ; AVX1-LABEL: trunc_xor_v16i64_v16i8: 4103 ; AVX1: # %bb.0: 4104 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 4105 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 4106 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 4107 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 4108 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 4109 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4110 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4111 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4112 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 4113 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 4114 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4115 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 4116 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 4117 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4118 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4119 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4120 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 4121 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4122 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4123 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4124 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 4125 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4126 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4127 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4128 ; AVX1-NEXT: vzeroupper 4129 ; AVX1-NEXT: retq 4130 ; 4131 ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: 4132 ; AVX2-SLOW: # %bb.0: 4133 ; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1 4134 ; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0 4135 ; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3 4136 ; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2 4137 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 4138 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4139 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 4140 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 4141 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4142 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4143 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4144 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4145 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4146 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4147 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4148 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4149 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4150 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4151 ; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm0 4152 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4153 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4154 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 4155 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4156 ; AVX2-SLOW-NEXT: vzeroupper 4157 ; AVX2-SLOW-NEXT: retq 4158 ; 4159 ; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: 4160 ; AVX2-FAST: # %bb.0: 4161 ; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 4162 ; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 4163 ; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 4164 ; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 4165 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 4166 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 4167 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 4168 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4169 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4170 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4171 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4172 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4173 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 4174 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 4175 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 4176 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4177 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4178 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4179 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 4180 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4181 ; AVX2-FAST-NEXT: vzeroupper 4182 ; AVX2-FAST-NEXT: retq 4183 ; 4184 ; AVX512-LABEL: trunc_xor_v16i64_v16i8: 4185 ; AVX512: # %bb.0: 4186 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 4187 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 4188 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4189 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 4190 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4191 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4192 ; AVX512-NEXT: vzeroupper 4193 ; AVX512-NEXT: retq 4194 %1 = xor <16 x i64> %a0, %a1 4195 %2 = trunc <16 x i64> %1 to <16 x i8> 4196 ret <16 x i8> %2 4197 } 4198 4199 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 4200 ; SSE-LABEL: trunc_xor_v16i32_v16i8: 4201 ; SSE: # %bb.0: 4202 ; SSE-NEXT: pxor %xmm4, %xmm0 4203 ; SSE-NEXT: pxor %xmm5, %xmm1 4204 ; SSE-NEXT: pxor %xmm6, %xmm2 4205 ; SSE-NEXT: pxor %xmm7, %xmm3 4206 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4207 ; SSE-NEXT: pand %xmm4, %xmm3 4208 ; SSE-NEXT: pand %xmm4, %xmm2 4209 ; SSE-NEXT: packuswb %xmm3, %xmm2 4210 ; SSE-NEXT: pand %xmm4, %xmm1 4211 ; SSE-NEXT: pand %xmm4, %xmm0 4212 ; SSE-NEXT: packuswb %xmm1, %xmm0 4213 ; SSE-NEXT: packuswb %xmm2, %xmm0 4214 ; SSE-NEXT: retq 4215 ; 4216 ; AVX1-LABEL: trunc_xor_v16i32_v16i8: 4217 ; AVX1: # %bb.0: 4218 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 4219 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 4220 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4221 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4222 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4223 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 4224 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4225 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4226 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 4227 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 4228 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4229 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4230 ; AVX1-NEXT: vzeroupper 4231 ; AVX1-NEXT: retq 4232 
;
; AVX2-LABEL: trunc_xor_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; xor to constant
;

define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
; SSE: # %bb.0: 4328 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4329 ; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 4330 ; SSE-NEXT: retq 4331 ; 4332 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: 4333 ; AVX1: # %bb.0: 4334 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4335 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4336 ; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 4337 ; AVX1-NEXT: vzeroupper 4338 ; AVX1-NEXT: retq 4339 ; 4340 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: 4341 ; AVX2-SLOW: # %bb.0: 4342 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4343 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 4344 ; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 4345 ; AVX2-SLOW-NEXT: vzeroupper 4346 ; AVX2-SLOW-NEXT: retq 4347 ; 4348 ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: 4349 ; AVX2-FAST: # %bb.0: 4350 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 4351 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 4352 ; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 4353 ; AVX2-FAST-NEXT: vzeroupper 4354 ; AVX2-FAST-NEXT: retq 4355 ; 4356 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: 4357 ; AVX512: # %bb.0: 4358 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4359 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4360 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4361 ; AVX512-NEXT: vzeroupper 4362 ; AVX512-NEXT: retq 4363 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 4364 %2 = trunc <4 x i64> %1 to <4 x i32> 4365 ret <4 x i32> %2 4366 } 4367 4368 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 4369 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16: 4370 ; SSE: # %bb.0: 4371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4372 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 4373 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4374 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 4375 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4376 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 4377 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 4378 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4379 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 4380 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4381 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 4382 ; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 4383 ; SSE-NEXT: retq 4384 ; 4385 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: 4386 ; AVX1: # %bb.0: 4387 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4388 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 4389 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4390 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 4391 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4392 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4393 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4394 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 4395 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4396 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4397 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4398 ; AVX1-NEXT: vzeroupper 4399 ; AVX1-NEXT: retq 4400 ; 4401 ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: 4402 ; AVX2-SLOW: # %bb.0: 4403 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4404 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4405 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4406 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4407 ; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4408 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4409 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4410 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4411 ; AVX2-SLOW-NEXT: vzeroupper 4412 ; AVX2-SLOW-NEXT: retq 4413 ; 4414 ; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: 4415 ; AVX2-FAST: # %bb.0: 4416 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 4417 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 4418 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 4419 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4420 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4421 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4422 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4423 ; AVX2-FAST-NEXT: vzeroupper 4424 ; AVX2-FAST-NEXT: retq 4425 ; 4426 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: 4427 ; AVX512: # %bb.0: 4428 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4429 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4430 ; AVX512-NEXT: vzeroupper 4431 ; AVX512-NEXT: retq 4432 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 4433 %2 = trunc <8 x i64> %1 to <8 x i16> 4434 ret <8 x i16> %2 4435 } 4436 4437 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 4438 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16: 4439 ; SSE: # %bb.0: 4440 ; SSE-NEXT: pslld $16, %xmm1 4441 ; SSE-NEXT: psrad $16, %xmm1 4442 ; SSE-NEXT: pslld $16, %xmm0 4443 ; SSE-NEXT: psrad $16, %xmm0 4444 ; SSE-NEXT: packssdw %xmm1, %xmm0 4445 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4446 ; SSE-NEXT: retq 4447 ; 4448 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: 4449 ; AVX1: # %bb.0: 4450 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4451 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4452 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4453 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4454 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4455 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4456 ; AVX1-NEXT: vzeroupper 4457 ; AVX1-NEXT: retq 4458 ; 4459 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: 4460 ; AVX2: # %bb.0: 4461 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4462 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4463 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4464 ; AVX2-NEXT: vzeroupper 4465 ; AVX2-NEXT: retq 4466 ; 4467 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: 4468 ; AVX512: # %bb.0: 4469 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4470 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4471 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4472 ; AVX512-NEXT: vzeroupper 4473 ; AVX512-NEXT: retq 4474 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4475 %2 = trunc <8 x i32> %1 to <8 x i16> 4476 ret <8 x i16> %2 4477 } 4478 4479 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 4480 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 4481 ; SSE: # %bb.0: 4482 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4483 ; SSE-NEXT: pand %xmm8, %xmm7 4484 ; SSE-NEXT: pand %xmm8, %xmm6 4485 ; SSE-NEXT: packuswb %xmm7, %xmm6 4486 ; SSE-NEXT: pand %xmm8, %xmm5 4487 ; SSE-NEXT: pand %xmm8, %xmm4 4488 ; SSE-NEXT: packuswb %xmm5, %xmm4 4489 ; SSE-NEXT: packuswb %xmm6, %xmm4 
4490 ; SSE-NEXT: pand %xmm8, %xmm3 4491 ; SSE-NEXT: pand %xmm8, %xmm2 4492 ; SSE-NEXT: packuswb %xmm3, %xmm2 4493 ; SSE-NEXT: pand %xmm8, %xmm1 4494 ; SSE-NEXT: pand %xmm8, %xmm0 4495 ; SSE-NEXT: packuswb %xmm1, %xmm0 4496 ; SSE-NEXT: packuswb %xmm2, %xmm0 4497 ; SSE-NEXT: packuswb %xmm4, %xmm0 4498 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4499 ; SSE-NEXT: retq 4500 ; 4501 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: 4502 ; AVX1: # %bb.0: 4503 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 4504 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4505 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 4506 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 4507 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 4508 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 4509 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 4510 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 4511 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 4512 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4513 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4514 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 4515 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 4516 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4517 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4518 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 4519 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 4520 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4521 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4522 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4523 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4524 ; AVX1-NEXT: vzeroupper 4525 ; AVX1-NEXT: retq 4526 ; 4527 ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: 4528 ; AVX2-SLOW: # %bb.0: 4529 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 4530 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4531 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 4532 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 4533 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4534 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4535 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4536 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4537 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4538 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4539 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4540 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4541 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4542 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4543 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4544 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4545 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4546 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 4547 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4548 ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4549 ; AVX2-SLOW-NEXT: vzeroupper 4550 ; AVX2-SLOW-NEXT: retq 4551 ; 4552 ; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8: 4553 ; AVX2-FAST: # %bb.0: 4554 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 4555 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 4556 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 4557 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4558 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4559 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4560 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4561 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4562 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 4563 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 4564 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 4565 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4566 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4567 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4568 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 4569 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4570 ; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4571 ; AVX2-FAST-NEXT: vzeroupper 4572 ; AVX2-FAST-NEXT: retq 4573 ; 4574 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: 4575 ; AVX512: # %bb.0: 4576 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4577 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 4578 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4579 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4580 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4581 ; AVX512-NEXT: vzeroupper 4582 ; AVX512-NEXT: retq 4583 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 4584 %2 = trunc <16 x i64> %1 to <16 x i8> 4585 ret <16 x i8> %2 4586 } 4587 4588 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 4589 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8: 4590 ; SSE: # %bb.0: 4591 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4592 ; SSE-NEXT: pand %xmm4, %xmm3 4593 ; SSE-NEXT: pand %xmm4, %xmm2 4594 ; SSE-NEXT: packuswb %xmm3, %xmm2 4595 ; SSE-NEXT: pand %xmm4, %xmm1 4596 ; SSE-NEXT: pand %xmm4, %xmm0 4597 ; SSE-NEXT: packuswb %xmm1, %xmm0 4598 ; SSE-NEXT: packuswb %xmm2, %xmm0 4599 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4600 ; SSE-NEXT: retq 4601 ; 4602 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: 4603 ; AVX1: # %bb.0: 4604 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4605 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4606 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 4607 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 4608 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4609 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4610 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 4611 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 4612 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4613 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4614 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4615 ; AVX1-NEXT: vzeroupper 4616 ; AVX1-NEXT: retq 4617 ; 4618 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: 4619 ; AVX2: # %bb.0: 4620 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4621 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 4622 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4623 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4624 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 4625 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 4626 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4627 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 4628 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4629 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4630 ; AVX2-NEXT: vzeroupper 4631 ; AVX2-NEXT: retq 4632 ; 4633 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: 4634 ; AVX512: # %bb.0: 4635 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4636 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4637 ; AVX512-NEXT: vzeroupper 4638 ; AVX512-NEXT: retq 4639 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15> 4640 %2 = trunc <16 x i32> %1 to <16 x i8> 4641 ret <16 x i8> %2 4642 } 4643 4644 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 4645 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8: 4646 ; SSE: # %bb.0: 4647 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 4648 ; SSE-NEXT: pand %xmm2, %xmm1 4649 ; SSE-NEXT: pand %xmm2, %xmm0 4650 ; SSE-NEXT: packuswb %xmm1, %xmm0 4651 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 4652 ; SSE-NEXT: retq 4653 ; 4654 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: 4655 ; AVX1: # %bb.0: 4656 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4657 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4658 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4659 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4660 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4661 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4662 ; AVX1-NEXT: vzeroupper 4663 ; AVX1-NEXT: retq 4664 ; 4665 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: 4666 ; AVX2: # %bb.0: 4667 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4668 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4669 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4670 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4671 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4672 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4673 ; AVX2-NEXT: vzeroupper 4674 ; AVX2-NEXT: retq 4675 ; 4676 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: 4677 ; AVX512F: # %bb.0: 4678 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 4679 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4680 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4681 ; AVX512F-NEXT: vzeroupper 4682 ; AVX512F-NEXT: retq 4683 ; 4684 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: 4685 ; AVX512BW: # %bb.0: 4686 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4687 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 4688 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4689 ; AVX512BW-NEXT: vzeroupper 4690 ; AVX512BW-NEXT: retq 4691 ; 4692 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: 4693 ; AVX512DQ: # %bb.0: 4694 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 4695 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 4696 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 4697 ; AVX512DQ-NEXT: vzeroupper 4698 ; AVX512DQ-NEXT: retq 4699 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 4700 %2 = trunc <16 x i16> %1 to <16 x i8> 4701 ret <16 x i8> %2 4702 } 4703 4704 ; 4705 ; or 4706 ; 4707 4708 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 4709 ; SSE-LABEL: trunc_or_v4i64_v4i32: 4710 ; SSE: # %bb.0: 4711 ; SSE-NEXT: orps %xmm3, %xmm1 4712 ; SSE-NEXT: orps %xmm2, %xmm0 4713 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4714 ; SSE-NEXT: retq 4715 ; 4716 ; AVX1-LABEL: trunc_or_v4i64_v4i32: 4717 ; AVX1: # %bb.0: 4718 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4719 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4720 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4721 ; AVX1-NEXT: vzeroupper 4722 ; AVX1-NEXT: retq 4723 ; 4724 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: 4725 ; AVX2-SLOW: # %bb.0: 4726 ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 4727 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4728 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 4729 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4730 ; AVX2-SLOW-NEXT: vzeroupper 4731 ; AVX2-SLOW-NEXT: retq 4732 ; 4733 ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: 4734 
; AVX2-FAST: # %bb.0: 4735 ; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 4736 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 4737 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 4738 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4739 ; AVX2-FAST-NEXT: vzeroupper 4740 ; AVX2-FAST-NEXT: retq 4741 ; 4742 ; AVX512-LABEL: trunc_or_v4i64_v4i32: 4743 ; AVX512: # %bb.0: 4744 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 4745 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4746 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4747 ; AVX512-NEXT: vzeroupper 4748 ; AVX512-NEXT: retq 4749 %1 = or <4 x i64> %a0, %a1 4750 %2 = trunc <4 x i64> %1 to <4 x i32> 4751 ret <4 x i32> %2 4752 } 4753 4754 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 4755 ; SSE-LABEL: trunc_or_v8i64_v8i16: 4756 ; SSE: # %bb.0: 4757 ; SSE-NEXT: por %xmm6, %xmm2 4758 ; SSE-NEXT: por %xmm7, %xmm3 4759 ; SSE-NEXT: por %xmm4, %xmm0 4760 ; SSE-NEXT: por %xmm5, %xmm1 4761 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4762 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 4763 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4764 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 4765 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4766 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 4767 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 4768 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4769 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 4770 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4771 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 4772 ; SSE-NEXT: retq 4773 ; 4774 ; AVX1-LABEL: trunc_or_v8i64_v8i16: 4775 ; AVX1: # %bb.0: 4776 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4777 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4778 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 4779 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 4780 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4781 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 4782 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 4783 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4784 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 4785 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 4786 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4787 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4788 ; AVX1-NEXT: vzeroupper 4789 ; AVX1-NEXT: retq 4790 ; 4791 ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: 4792 ; AVX2-SLOW: # %bb.0: 4793 ; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 4794 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 4795 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4796 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4797 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4798 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4799 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4800 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4801 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4802 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4803 ; AVX2-SLOW-NEXT: vzeroupper 4804 ; AVX2-SLOW-NEXT: retq 4805 ; 4806 ; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16: 4807 ; AVX2-FAST: # %bb.0: 4808 ; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 4809 ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 4810 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,2,4,6,4,6,6,7] 4811 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 4812 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 4813 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4814 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4815 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4816 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4817 ; AVX2-FAST-NEXT: vzeroupper 4818 ; AVX2-FAST-NEXT: retq 4819 ; 4820 ; AVX512-LABEL: trunc_or_v8i64_v8i16: 4821 ; AVX512: # %bb.0: 4822 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 4823 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4824 ; AVX512-NEXT: vzeroupper 4825 ; AVX512-NEXT: retq 4826 %1 = or <8 x i64> %a0, %a1 4827 %2 = trunc <8 x i64> %1 to <8 x i16> 4828 ret <8 x i16> %2 4829 } 4830 4831 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 4832 ; SSE-LABEL: trunc_or_v8i32_v8i16: 4833 ; SSE: # %bb.0: 4834 ; SSE-NEXT: por %xmm2, %xmm0 4835 ; SSE-NEXT: por %xmm3, %xmm1 4836 ; SSE-NEXT: pslld $16, %xmm1 4837 ; SSE-NEXT: psrad $16, %xmm1 4838 ; SSE-NEXT: pslld $16, %xmm0 4839 ; SSE-NEXT: psrad $16, %xmm0 4840 ; SSE-NEXT: packssdw %xmm1, %xmm0 4841 ; SSE-NEXT: retq 4842 ; 4843 ; AVX1-LABEL: trunc_or_v8i32_v8i16: 4844 ; AVX1: # %bb.0: 4845 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4846 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4847 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 4848 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4849 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4850 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4851 ; AVX1-NEXT: vzeroupper 4852 ; AVX1-NEXT: retq 4853 ; 4854 ; AVX2-LABEL: trunc_or_v8i32_v8i16: 4855 ; AVX2: # %bb.0: 4856 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4857 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4858 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4859 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4860 ; AVX2-NEXT: vzeroupper 4861 ; AVX2-NEXT: retq 4862 ; 4863 ; AVX512-LABEL: trunc_or_v8i32_v8i16: 4864 ; AVX512: # %bb.0: 4865 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 4866 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4867 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4868 ; AVX512-NEXT: vzeroupper 4869 ; AVX512-NEXT: retq 4870 %1 = or <8 x i32> %a0, %a1 4871 %2 = trunc <8 x i32> %1 to <8 x i16> 4872 ret <8 x i16> %2 4873 } 4874 4875 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4876 ; SSE-LABEL: trunc_or_v16i64_v16i8: 4877 ; SSE: # %bb.0: 4878 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 4879 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 4880 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 4881 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 4882 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 4883 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 4884 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 4885 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 4886 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4887 ; SSE-NEXT: pand %xmm8, %xmm7 4888 ; SSE-NEXT: pand %xmm8, %xmm6 4889 ; SSE-NEXT: packuswb %xmm7, %xmm6 4890 ; SSE-NEXT: pand %xmm8, %xmm5 4891 ; SSE-NEXT: pand %xmm8, %xmm4 4892 ; SSE-NEXT: packuswb %xmm5, %xmm4 4893 ; SSE-NEXT: packuswb %xmm6, %xmm4 4894 ; SSE-NEXT: pand %xmm8, %xmm3 4895 ; SSE-NEXT: pand %xmm8, %xmm2 4896 ; SSE-NEXT: packuswb %xmm3, %xmm2 4897 ; SSE-NEXT: pand %xmm8, %xmm1 4898 ; SSE-NEXT: pand %xmm8, %xmm0 4899 ; SSE-NEXT: packuswb 
%xmm1, %xmm0 4900 ; SSE-NEXT: packuswb %xmm2, %xmm0 4901 ; SSE-NEXT: packuswb %xmm4, %xmm0 4902 ; SSE-NEXT: retq 4903 ; 4904 ; AVX1-LABEL: trunc_or_v16i64_v16i8: 4905 ; AVX1: # %bb.0: 4906 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 4907 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 4908 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 4909 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 4910 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 4911 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4912 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4913 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4914 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 4915 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 4916 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 4917 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 4918 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 4919 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4920 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4921 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4922 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 4923 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4924 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4925 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 4926 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 4927 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4928 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4929 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4930 ; AVX1-NEXT: vzeroupper 4931 ; AVX1-NEXT: retq 4932 ; 4933 ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: 4934 ; AVX2-SLOW: # %bb.0: 4935 ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1 4936 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 4937 ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3 4938 ; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2 4939 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 4940 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4941 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 4942 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 4943 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4944 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4945 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4946 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4947 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4948 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 4949 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 4950 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4951 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 4952 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 4953 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4954 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4955 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4956 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 4957 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4958 ; AVX2-SLOW-NEXT: vzeroupper 4959 ; AVX2-SLOW-NEXT: retq 4960 ; 4961 ; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8: 4962 ; AVX2-FAST: # %bb.0: 4963 ; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 4964 ; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 4965 ; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3 4966 ; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 4967 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 4968 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 4969 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 4970 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 4971 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 4972 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 4973 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 4974 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 4975 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 4976 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 4977 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 4978 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 4979 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 4980 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4981 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 4982 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 4983 ; AVX2-FAST-NEXT: vzeroupper 4984 ; AVX2-FAST-NEXT: retq 4985 ; 4986 ; AVX512-LABEL: trunc_or_v16i64_v16i8: 4987 ; AVX512: # %bb.0: 4988 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 4989 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 4990 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4991 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 4992 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4993 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4994 ; AVX512-NEXT: vzeroupper 4995 ; AVX512-NEXT: retq 4996 %1 = or <16 x i64> %a0, %a1 4997 %2 = trunc <16 x i64> %1 to <16 x i8> 4998 ret <16 x i8> %2 4999 } 5000 5001 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 5002 ; SSE-LABEL: trunc_or_v16i32_v16i8: 5003 ; SSE: # %bb.0: 5004 ; SSE-NEXT: por %xmm4, %xmm0 5005 ; SSE-NEXT: por %xmm5, %xmm1 5006 ; SSE-NEXT: por %xmm6, %xmm2 5007 ; SSE-NEXT: por %xmm7, %xmm3 5008 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5009 ; SSE-NEXT: pand %xmm4, %xmm3 5010 ; SSE-NEXT: pand %xmm4, %xmm2 5011 ; SSE-NEXT: packuswb %xmm3, %xmm2 5012 ; SSE-NEXT: pand %xmm4, %xmm1 5013 ; SSE-NEXT: pand %xmm4, %xmm0 5014 ; SSE-NEXT: packuswb %xmm1, %xmm0 5015 ; SSE-NEXT: packuswb %xmm2, %xmm0 5016 ; SSE-NEXT: retq 5017 ; 5018 ; AVX1-LABEL: trunc_or_v16i32_v16i8: 5019 ; AVX1: # %bb.0: 5020 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 5021 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 5022 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 5023 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5024 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 5025 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 5026 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 5027 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 5028 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 5029 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 5030 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 5031 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5032 ; AVX1-NEXT: vzeroupper 5033 ; AVX1-NEXT: retq 5034 ; 5035 ; AVX2-LABEL: trunc_or_v16i32_v16i8: 5036 ; AVX2: # %bb.0: 5037 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 5038 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 5039 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5040 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 5041 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 5042 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5043 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 5044 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 5045 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5046 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 5047 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5048 ; AVX2-NEXT: vzeroupper 5049 ; AVX2-NEXT: retq 5050 ; 5051 ; AVX512-LABEL: trunc_or_v16i32_v16i8: 5052 ; AVX512: # %bb.0: 5053 ; AVX512-NEXT: vporq %zmm1, %zmm0, 
%zmm0 5054 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 5055 ; AVX512-NEXT: vzeroupper 5056 ; AVX512-NEXT: retq 5057 %1 = or <16 x i32> %a0, %a1 5058 %2 = trunc <16 x i32> %1 to <16 x i8> 5059 ret <16 x i8> %2 5060 } 5061 5062 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 5063 ; SSE-LABEL: trunc_or_v16i16_v16i8: 5064 ; SSE: # %bb.0: 5065 ; SSE-NEXT: por %xmm2, %xmm0 5066 ; SSE-NEXT: por %xmm3, %xmm1 5067 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 5068 ; SSE-NEXT: pand %xmm2, %xmm1 5069 ; SSE-NEXT: pand %xmm2, %xmm0 5070 ; SSE-NEXT: packuswb %xmm1, %xmm0 5071 ; SSE-NEXT: retq 5072 ; 5073 ; AVX1-LABEL: trunc_or_v16i16_v16i8: 5074 ; AVX1: # %bb.0: 5075 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 5076 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 5077 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5078 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5079 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5080 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5081 ; AVX1-NEXT: vzeroupper 5082 ; AVX1-NEXT: retq 5083 ; 5084 ; AVX2-LABEL: trunc_or_v16i16_v16i8: 5085 ; AVX2: # %bb.0: 5086 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 5087 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5088 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5089 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5090 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5091 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5092 ; AVX2-NEXT: vzeroupper 5093 ; AVX2-NEXT: retq 5094 ; 5095 ; AVX512F-LABEL: trunc_or_v16i16_v16i8: 5096 ; AVX512F: # %bb.0: 5097 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 5098 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 5099 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 5100 ; AVX512F-NEXT: vzeroupper 5101 ; AVX512F-NEXT: retq 5102 ; 5103 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8: 5104 ; AVX512BW: # %bb.0: 5105 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 5106 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 5107 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 5108 ; AVX512BW-NEXT: vzeroupper 5109 ; AVX512BW-NEXT: retq 5110 ; 5111 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: 5112 ; AVX512DQ: # %bb.0: 5113 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 5114 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 5115 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 5116 ; AVX512DQ-NEXT: vzeroupper 5117 ; AVX512DQ-NEXT: retq 5118 %1 = or <16 x i16> %a0, %a1 5119 %2 = trunc <16 x i16> %1 to <16 x i8> 5120 ret <16 x i8> %2 5121 } 5122 5123 ; 5124 ; or to constant 5125 ; 5126 5127 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 5128 ; SSE-LABEL: trunc_or_const_v4i64_v4i32: 5129 ; SSE: # %bb.0: 5130 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 5131 ; SSE-NEXT: orps {{.*}}(%rip), %xmm0 5132 ; SSE-NEXT: retq 5133 ; 5134 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32: 5135 ; AVX1: # %bb.0: 5136 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 5137 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 5138 ; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 5139 ; AVX1-NEXT: vzeroupper 5140 ; AVX1-NEXT: retq 5141 ; 5142 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: 5143 ; AVX2-SLOW: # %bb.0: 5144 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 5145 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 5146 ; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 5147 ; AVX2-SLOW-NEXT: vzeroupper 5148 ; AVX2-SLOW-NEXT: retq 5149 ; 5150 ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: 5151 ; AVX2-FAST: # %bb.0: 5152 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = 
[0,2,4,6,4,6,6,7] 5153 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 5154 ; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 5155 ; AVX2-FAST-NEXT: vzeroupper 5156 ; AVX2-FAST-NEXT: retq 5157 ; 5158 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: 5159 ; AVX512: # %bb.0: 5160 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 5161 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 5162 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5163 ; AVX512-NEXT: vzeroupper 5164 ; AVX512-NEXT: retq 5165 %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 5166 %2 = trunc <4 x i64> %1 to <4 x i32> 5167 ret <4 x i32> %2 5168 } 5169 5170 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 5171 ; SSE-LABEL: trunc_or_const_v8i64_v8i16: 5172 ; SSE: # %bb.0: 5173 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5174 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 5175 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 5176 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 5177 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5178 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 5179 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 5180 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 5181 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 5182 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 5183 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 5184 ; SSE-NEXT: orpd {{.*}}(%rip), %xmm0 5185 ; SSE-NEXT: retq 5186 ; 5187 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: 5188 ; AVX1: # %bb.0: 5189 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 5190 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 5191 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 5192 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 5193 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 5194 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 5195 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 5196 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 5197 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 5198 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 5199 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5200 ; AVX1-NEXT: vzeroupper 5201 ; AVX1-NEXT: retq 5202 ; 5203 ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: 5204 ; AVX2-SLOW: # %bb.0: 5205 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 5206 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5207 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5208 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 5209 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5210 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5211 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5212 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5213 ; AVX2-SLOW-NEXT: vzeroupper 5214 ; AVX2-SLOW-NEXT: retq 5215 ; 5216 ; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16: 5217 ; AVX2-FAST: # %bb.0: 5218 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 5219 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 5220 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 5221 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5222 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5223 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5224 ; AVX2-FAST-NEXT: vpor 
{{.*}}(%rip), %xmm0, %xmm0 5225 ; AVX2-FAST-NEXT: vzeroupper 5226 ; AVX2-FAST-NEXT: retq 5227 ; 5228 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16: 5229 ; AVX512: # %bb.0: 5230 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 5231 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5232 ; AVX512-NEXT: vzeroupper 5233 ; AVX512-NEXT: retq 5234 %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 5235 %2 = trunc <8 x i64> %1 to <8 x i16> 5236 ret <8 x i16> %2 5237 } 5238 5239 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 5240 ; SSE-LABEL: trunc_or_const_v8i32_v8i16: 5241 ; SSE: # %bb.0: 5242 ; SSE-NEXT: pslld $16, %xmm1 5243 ; SSE-NEXT: psrad $16, %xmm1 5244 ; SSE-NEXT: pslld $16, %xmm0 5245 ; SSE-NEXT: psrad $16, %xmm0 5246 ; SSE-NEXT: packssdw %xmm1, %xmm0 5247 ; SSE-NEXT: por {{.*}}(%rip), %xmm0 5248 ; SSE-NEXT: retq 5249 ; 5250 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16: 5251 ; AVX1: # %bb.0: 5252 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 5253 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 5254 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5255 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5256 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5257 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5258 ; AVX1-NEXT: vzeroupper 5259 ; AVX1-NEXT: retq 5260 ; 5261 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16: 5262 ; AVX2: # %bb.0: 5263 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5264 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5265 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5266 ; AVX2-NEXT: vzeroupper 5267 ; AVX2-NEXT: retq 5268 ; 5269 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16: 5270 ; AVX512: # %bb.0: 5271 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 5272 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 5273 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5274 ; AVX512-NEXT: vzeroupper 5275 ; AVX512-NEXT: retq 5276 %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 5277 %2 = trunc <8 x i32> %1 to <8 x i16> 5278 ret <8 x i16> %2 5279 } 5280 5281 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 5282 ; SSE-LABEL: trunc_or_const_v16i64_v16i8: 5283 ; SSE: # %bb.0: 5284 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 5285 ; SSE-NEXT: pand %xmm8, %xmm7 5286 ; SSE-NEXT: pand %xmm8, %xmm6 5287 ; SSE-NEXT: packuswb %xmm7, %xmm6 5288 ; SSE-NEXT: pand %xmm8, %xmm5 5289 ; SSE-NEXT: pand %xmm8, %xmm4 5290 ; SSE-NEXT: packuswb %xmm5, %xmm4 5291 ; SSE-NEXT: packuswb %xmm6, %xmm4 5292 ; SSE-NEXT: pand %xmm8, %xmm3 5293 ; SSE-NEXT: pand %xmm8, %xmm2 5294 ; SSE-NEXT: packuswb %xmm3, %xmm2 5295 ; SSE-NEXT: pand %xmm8, %xmm1 5296 ; SSE-NEXT: pand %xmm8, %xmm0 5297 ; SSE-NEXT: packuswb %xmm1, %xmm0 5298 ; SSE-NEXT: packuswb %xmm2, %xmm0 5299 ; SSE-NEXT: packuswb %xmm4, %xmm0 5300 ; SSE-NEXT: por {{.*}}(%rip), %xmm0 5301 ; SSE-NEXT: retq 5302 ; 5303 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: 5304 ; AVX1: # %bb.0: 5305 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 5306 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 5307 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 5308 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 5309 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 5310 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 5311 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 5312 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 5313 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 5314 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, 
%xmm2 5315 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 5316 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 5317 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 5318 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 5319 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 5320 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 5321 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 5322 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 5323 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 5324 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 5325 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5326 ; AVX1-NEXT: vzeroupper 5327 ; AVX1-NEXT: retq 5328 ; 5329 ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: 5330 ; AVX2-SLOW: # %bb.0: 5331 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 5332 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 5333 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] 5334 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 5335 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 5336 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5337 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 5338 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 5339 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5340 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 5341 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 5342 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5343 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] 5344 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 5345 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5346 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 5347 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5348 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 5349 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 5350 ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5351 ; AVX2-SLOW-NEXT: vzeroupper 5352 ; AVX2-SLOW-NEXT: retq 5353 ; 5354 ; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8: 5355 ; AVX2-FAST: # %bb.0: 5356 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] 5357 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 5358 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 5359 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 5360 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5361 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 5362 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] 5363 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5364 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 5365 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 5366 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 5367 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 5368 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 5369 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5370 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 5371 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 5372 ; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5373 ; AVX2-FAST-NEXT: vzeroupper 5374 ; AVX2-FAST-NEXT: retq 5375 ; 5376 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8: 5377 ; AVX512: # %bb.0: 5378 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 5379 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1 5380 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 5381 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 5382 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5383 ; AVX512-NEXT: 
vzeroupper 5384 ; AVX512-NEXT: retq 5385 %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 5386 %2 = trunc <16 x i64> %1 to <16 x i8> 5387 ret <16 x i8> %2 5388 } 5389 5390 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 5391 ; SSE-LABEL: trunc_or_const_v16i32_v16i8: 5392 ; SSE: # %bb.0: 5393 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5394 ; SSE-NEXT: pand %xmm4, %xmm3 5395 ; SSE-NEXT: pand %xmm4, %xmm2 5396 ; SSE-NEXT: packuswb %xmm3, %xmm2 5397 ; SSE-NEXT: pand %xmm4, %xmm1 5398 ; SSE-NEXT: pand %xmm4, %xmm0 5399 ; SSE-NEXT: packuswb %xmm1, %xmm0 5400 ; SSE-NEXT: packuswb %xmm2, %xmm0 5401 ; SSE-NEXT: por {{.*}}(%rip), %xmm0 5402 ; SSE-NEXT: retq 5403 ; 5404 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: 5405 ; AVX1: # %bb.0: 5406 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 5407 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 5408 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 5409 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 5410 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 5411 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 5412 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 5413 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 5414 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 5415 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5416 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5417 ; AVX1-NEXT: vzeroupper 5418 ; AVX1-NEXT: retq 5419 ; 5420 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8: 5421 ; AVX2: # %bb.0: 5422 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 5423 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 5424 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 5425 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5426 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 5427 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 5428 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 5429 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 5430 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5431 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5432 ; AVX2-NEXT: vzeroupper 5433 ; AVX2-NEXT: retq 5434 ; 5435 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8: 5436 ; AVX512: # %bb.0: 5437 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 5438 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5439 ; AVX512-NEXT: vzeroupper 5440 ; AVX512-NEXT: retq 5441 %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5442 %2 = trunc <16 x i32> %1 to <16 x i8> 5443 ret <16 x i8> %2 5444 } 5445 5446 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 5447 ; SSE-LABEL: trunc_or_const_v16i16_v16i8: 5448 ; SSE: # %bb.0: 5449 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 5450 ; SSE-NEXT: pand %xmm2, %xmm1 5451 ; SSE-NEXT: pand %xmm2, %xmm0 5452 ; SSE-NEXT: packuswb %xmm1, %xmm0 5453 ; SSE-NEXT: por {{.*}}(%rip), %xmm0 5454 ; SSE-NEXT: retq 5455 ; 5456 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: 5457 ; AVX1: # %bb.0: 5458 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 5459 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 5460 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 5461 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 5462 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5463 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 5464 ; AVX1-NEXT: vzeroupper 5465 ; AVX1-NEXT: retq 5466 ; 5467 ; 
AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: paddq %xmm5, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: paddq %xmm0, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %3, %3
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: movdqa %xmm1, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: paddq %xmm5, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm4, %xmm1
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm1, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm5, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2
%4 = add <4 x i64> %1, %3
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}
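
; The three mul_add tests above exercise the sext -> mul -> add -> trunc chains
; that the loop vectorizer commonly emits; their AVX CHECK lines show the whole
; 64-bit chain being narrowed to a single 32-bit vpmulld / vpaddd pair. The
; function below is a hand-written sketch of that same IR pattern, kept for
; reference only: its name is hypothetical and it carries no autogenerated
; CHECK lines, so FileCheck does not verify its codegen.

define <4 x i32> @mul_add_const_sketch_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
  %w0 = sext <4 x i32> %a0 to <4 x i64>                  ; widen both inputs to 64 bits
  %w1 = sext <4 x i32> %a1 to <4 x i64>
  %p = mul <4 x i64> %w0, %w1                            ; 64-bit multiply of the widened values
  %s = add <4 x i64> %p, <i64 -3, i64 -1, i64 1, i64 3>  ; add a constant vector
  %t = trunc <4 x i64> %s to <4 x i32>                   ; only the low 32 bits survive
  ret <4 x i32> %t
}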