1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW 7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL 8 9 ; 10 ; vXf32 (accum) 11 ; 12 13 define float @test_v2f32(float %a0, <2 x float> %a1) { 14 ; SSE2-LABEL: test_v2f32: 15 ; SSE2: # %bb.0: 16 ; SSE2-NEXT: addss %xmm1, %xmm0 17 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] 18 ; SSE2-NEXT: addss %xmm1, %xmm0 19 ; SSE2-NEXT: retq 20 ; 21 ; SSE41-LABEL: test_v2f32: 22 ; SSE41: # %bb.0: 23 ; SSE41-NEXT: addss %xmm1, %xmm0 24 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 25 ; SSE41-NEXT: addss %xmm1, %xmm0 26 ; SSE41-NEXT: retq 27 ; 28 ; AVX-LABEL: test_v2f32: 29 ; AVX: # %bb.0: 30 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 31 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 32 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 33 ; AVX-NEXT: retq 34 ; 35 ; AVX512-LABEL: test_v2f32: 36 ; AVX512: # %bb.0: 37 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 38 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 39 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 40 ; AVX512-NEXT: retq 41 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) 42 ret float %1 43 } 44 45 define float @test_v4f32(float %a0, <4 x float> %a1) { 46 ; SSE2-LABEL: test_v4f32: 47 ; SSE2: # %bb.0: 48 ; SSE2-NEXT: addss %xmm1, %xmm0 49 ; SSE2-NEXT: movaps %xmm1, %xmm2 50 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 51 ; SSE2-NEXT: addss %xmm2, %xmm0 52 ; SSE2-NEXT: movaps %xmm1, %xmm2 53 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 54 ; SSE2-NEXT: addss %xmm2, %xmm0 55 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 56 ; SSE2-NEXT: addss %xmm1, %xmm0 57 ; SSE2-NEXT: retq 58 ; 59 ; SSE41-LABEL: test_v4f32: 60 ; SSE41: # %bb.0: 61 ; SSE41-NEXT: addss %xmm1, %xmm0 62 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 63 ; SSE41-NEXT: addss %xmm2, %xmm0 64 ; SSE41-NEXT: movaps %xmm1, %xmm2 65 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 66 ; SSE41-NEXT: addss %xmm2, %xmm0 67 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 68 ; SSE41-NEXT: addss %xmm1, %xmm0 69 ; SSE41-NEXT: retq 70 ; 71 ; AVX-LABEL: test_v4f32: 72 ; AVX: # %bb.0: 73 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 74 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 75 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 76 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 77 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 78 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 79 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 80 ; AVX-NEXT: retq 81 ; 82 ; AVX512-LABEL: test_v4f32: 83 ; AVX512: # %bb.0: 84 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 85 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 86 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 87 ; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 88 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 89 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 90 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 91 ; AVX512-NEXT: retq 92 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) 93 ret float %1 94 } 95 96 define float @test_v8f32(float %a0, <8 x float> %a1) { 97 ; SSE2-LABEL: test_v8f32: 98 ; SSE2: # %bb.0: 99 ; SSE2-NEXT: addss %xmm1, %xmm0 100 ; SSE2-NEXT: movaps %xmm1, %xmm3 101 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3] 102 ; SSE2-NEXT: addss %xmm3, %xmm0 103 ; SSE2-NEXT: movaps %xmm1, %xmm3 104 ; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] 105 ; SSE2-NEXT: addss %xmm3, %xmm0 106 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 107 ; SSE2-NEXT: addss %xmm1, %xmm0 108 ; SSE2-NEXT: addss %xmm2, %xmm0 109 ; SSE2-NEXT: movaps %xmm2, %xmm1 110 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 111 ; SSE2-NEXT: addss %xmm1, %xmm0 112 ; SSE2-NEXT: movaps %xmm2, %xmm1 113 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 114 ; SSE2-NEXT: addss %xmm1, %xmm0 115 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 116 ; SSE2-NEXT: addss %xmm2, %xmm0 117 ; SSE2-NEXT: retq 118 ; 119 ; SSE41-LABEL: test_v8f32: 120 ; SSE41: # %bb.0: 121 ; SSE41-NEXT: addss %xmm1, %xmm0 122 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 123 ; SSE41-NEXT: addss %xmm3, %xmm0 124 ; SSE41-NEXT: movaps %xmm1, %xmm3 125 ; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] 126 ; SSE41-NEXT: addss %xmm3, %xmm0 127 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 128 ; SSE41-NEXT: addss %xmm1, %xmm0 129 ; SSE41-NEXT: addss %xmm2, %xmm0 130 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 131 ; SSE41-NEXT: addss %xmm1, %xmm0 132 ; SSE41-NEXT: movaps %xmm2, %xmm1 133 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 134 ; SSE41-NEXT: addss %xmm1, %xmm0 135 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 136 ; SSE41-NEXT: addss %xmm2, %xmm0 137 ; SSE41-NEXT: retq 138 ; 139 ; AVX-LABEL: test_v8f32: 140 ; AVX: # %bb.0: 141 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 142 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 143 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 144 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 145 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 146 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 147 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 148 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 149 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 150 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 151 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 152 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 153 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 154 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 155 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 156 ; AVX-NEXT: vzeroupper 157 ; AVX-NEXT: retq 158 ; 159 ; AVX512-LABEL: test_v8f32: 160 ; AVX512: # %bb.0: 161 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 162 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 163 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 164 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 165 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 166 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 167 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 168 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 169 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 170 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 171 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 172 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 173 ; 
AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 174 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 175 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 176 ; AVX512-NEXT: vzeroupper 177 ; AVX512-NEXT: retq 178 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) 179 ret float %1 180 } 181 182 define float @test_v16f32(float %a0, <16 x float> %a1) { 183 ; SSE2-LABEL: test_v16f32: 184 ; SSE2: # %bb.0: 185 ; SSE2-NEXT: addss %xmm1, %xmm0 186 ; SSE2-NEXT: movaps %xmm1, %xmm5 187 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3] 188 ; SSE2-NEXT: addss %xmm5, %xmm0 189 ; SSE2-NEXT: movaps %xmm1, %xmm5 190 ; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] 191 ; SSE2-NEXT: addss %xmm5, %xmm0 192 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 193 ; SSE2-NEXT: addss %xmm1, %xmm0 194 ; SSE2-NEXT: addss %xmm2, %xmm0 195 ; SSE2-NEXT: movaps %xmm2, %xmm1 196 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 197 ; SSE2-NEXT: addss %xmm1, %xmm0 198 ; SSE2-NEXT: movaps %xmm2, %xmm1 199 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 200 ; SSE2-NEXT: addss %xmm1, %xmm0 201 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 202 ; SSE2-NEXT: addss %xmm2, %xmm0 203 ; SSE2-NEXT: addss %xmm3, %xmm0 204 ; SSE2-NEXT: movaps %xmm3, %xmm1 205 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 206 ; SSE2-NEXT: addss %xmm1, %xmm0 207 ; SSE2-NEXT: movaps %xmm3, %xmm1 208 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 209 ; SSE2-NEXT: addss %xmm1, %xmm0 210 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 211 ; SSE2-NEXT: addss %xmm3, %xmm0 212 ; SSE2-NEXT: addss %xmm4, %xmm0 213 ; SSE2-NEXT: movaps %xmm4, %xmm1 214 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3] 215 ; SSE2-NEXT: addss %xmm1, %xmm0 216 ; SSE2-NEXT: movaps %xmm4, %xmm1 217 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] 218 ; SSE2-NEXT: addss %xmm1, %xmm0 219 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] 220 ; SSE2-NEXT: addss %xmm4, %xmm0 221 ; SSE2-NEXT: retq 222 ; 223 ; SSE41-LABEL: test_v16f32: 224 ; SSE41: # %bb.0: 225 ; SSE41-NEXT: addss %xmm1, %xmm0 226 ; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 227 ; SSE41-NEXT: addss %xmm5, %xmm0 228 ; SSE41-NEXT: movaps %xmm1, %xmm5 229 ; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] 230 ; SSE41-NEXT: addss %xmm5, %xmm0 231 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 232 ; SSE41-NEXT: addss %xmm1, %xmm0 233 ; SSE41-NEXT: addss %xmm2, %xmm0 234 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 235 ; SSE41-NEXT: addss %xmm1, %xmm0 236 ; SSE41-NEXT: movaps %xmm2, %xmm1 237 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 238 ; SSE41-NEXT: addss %xmm1, %xmm0 239 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 240 ; SSE41-NEXT: addss %xmm2, %xmm0 241 ; SSE41-NEXT: addss %xmm3, %xmm0 242 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 243 ; SSE41-NEXT: addss %xmm1, %xmm0 244 ; SSE41-NEXT: movaps %xmm3, %xmm1 245 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 246 ; SSE41-NEXT: addss %xmm1, %xmm0 247 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 248 ; SSE41-NEXT: addss %xmm3, %xmm0 249 ; SSE41-NEXT: addss %xmm4, %xmm0 250 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] 251 ; SSE41-NEXT: addss %xmm1, %xmm0 252 ; SSE41-NEXT: movaps %xmm4, %xmm1 253 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] 254 ; SSE41-NEXT: addss %xmm1, %xmm0 255 ; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] 256 ; SSE41-NEXT: addss %xmm4, %xmm0 257 ; SSE41-NEXT: retq 258 ; 259 ; AVX-LABEL: 
test_v16f32: 260 ; AVX: # %bb.0: 261 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 262 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 263 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 264 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 265 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 266 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 267 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 268 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 269 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 270 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 271 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 272 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 273 ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 274 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 275 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 276 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 277 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 278 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 279 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 280 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 281 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 282 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 283 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 284 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 285 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 286 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 287 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 288 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 289 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 290 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 291 ; AVX-NEXT: vzeroupper 292 ; AVX-NEXT: retq 293 ; 294 ; AVX512-LABEL: test_v16f32: 295 ; AVX512: # %bb.0: 296 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 297 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 298 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 299 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 300 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 301 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 302 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 303 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 304 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 305 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 306 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 307 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 308 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 309 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 310 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 311 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 312 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 313 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 314 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 315 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 316 ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 317 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 318 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 319 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 320 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 321 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 322 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 323 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 324 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 325 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 326 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 327 ; AVX512-NEXT: vzeroupper 328 ; AVX512-NEXT: retq 329 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) 330 ret float %1 331 } 332 333 ; 334 ; vXf32 (zero) 335 ; 336 337 define float @test_v2f32_zero(<2 x float> %a0) { 338 ; SSE2-LABEL: test_v2f32_zero: 339 ; SSE2: # %bb.0: 340 ; SSE2-NEXT: xorps %xmm1, %xmm1 341 ; SSE2-NEXT: addss %xmm0, %xmm1 
342 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] 343 ; SSE2-NEXT: addss %xmm1, %xmm0 344 ; SSE2-NEXT: retq 345 ; 346 ; SSE41-LABEL: test_v2f32_zero: 347 ; SSE41: # %bb.0: 348 ; SSE41-NEXT: xorps %xmm1, %xmm1 349 ; SSE41-NEXT: addss %xmm0, %xmm1 350 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 351 ; SSE41-NEXT: addss %xmm1, %xmm0 352 ; SSE41-NEXT: retq 353 ; 354 ; AVX-LABEL: test_v2f32_zero: 355 ; AVX: # %bb.0: 356 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 357 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 358 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 359 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 360 ; AVX-NEXT: retq 361 ; 362 ; AVX512-LABEL: test_v2f32_zero: 363 ; AVX512: # %bb.0: 364 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 365 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 366 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 367 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 368 ; AVX512-NEXT: retq 369 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) 370 ret float %1 371 } 372 373 define float @test_v4f32_zero(<4 x float> %a0) { 374 ; SSE2-LABEL: test_v4f32_zero: 375 ; SSE2: # %bb.0: 376 ; SSE2-NEXT: xorps %xmm1, %xmm1 377 ; SSE2-NEXT: addss %xmm0, %xmm1 378 ; SSE2-NEXT: movaps %xmm0, %xmm2 379 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] 380 ; SSE2-NEXT: addss %xmm1, %xmm2 381 ; SSE2-NEXT: movaps %xmm0, %xmm1 382 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] 383 ; SSE2-NEXT: addss %xmm2, %xmm1 384 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 385 ; SSE2-NEXT: addss %xmm1, %xmm0 386 ; SSE2-NEXT: retq 387 ; 388 ; SSE41-LABEL: test_v4f32_zero: 389 ; SSE41: # %bb.0: 390 ; SSE41-NEXT: xorps %xmm1, %xmm1 391 ; SSE41-NEXT: addss %xmm0, %xmm1 392 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 393 ; SSE41-NEXT: addss %xmm1, %xmm2 394 ; SSE41-NEXT: movaps %xmm0, %xmm1 395 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] 396 ; SSE41-NEXT: addss %xmm2, %xmm1 397 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 398 ; SSE41-NEXT: addss %xmm1, %xmm0 399 ; SSE41-NEXT: retq 400 ; 401 ; AVX-LABEL: test_v4f32_zero: 402 ; AVX: # %bb.0: 403 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 404 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 405 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 406 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 407 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 408 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 409 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 410 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 411 ; AVX-NEXT: retq 412 ; 413 ; AVX512-LABEL: test_v4f32_zero: 414 ; AVX512: # %bb.0: 415 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 416 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 417 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 418 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 419 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 420 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 421 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 422 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 423 ; AVX512-NEXT: retq 424 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) 425 ret float %1 426 } 427 428 define float @test_v8f32_zero(<8 x float> %a0) { 429 ; SSE2-LABEL: test_v8f32_zero: 430 ; SSE2: # %bb.0: 431 ; SSE2-NEXT: xorps %xmm2, %xmm2 432 ; SSE2-NEXT: addss %xmm0, %xmm2 433 ; SSE2-NEXT: movaps %xmm0, %xmm3 434 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] 435 ; SSE2-NEXT: addss %xmm2, %xmm3 436 ; SSE2-NEXT: movaps %xmm0, %xmm2 437 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = 
xmm0[1],xmm2[1] 438 ; SSE2-NEXT: addss %xmm3, %xmm2 439 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 440 ; SSE2-NEXT: addss %xmm2, %xmm0 441 ; SSE2-NEXT: addss %xmm1, %xmm0 442 ; SSE2-NEXT: movaps %xmm1, %xmm2 443 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 444 ; SSE2-NEXT: addss %xmm2, %xmm0 445 ; SSE2-NEXT: movaps %xmm1, %xmm2 446 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 447 ; SSE2-NEXT: addss %xmm2, %xmm0 448 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 449 ; SSE2-NEXT: addss %xmm1, %xmm0 450 ; SSE2-NEXT: retq 451 ; 452 ; SSE41-LABEL: test_v8f32_zero: 453 ; SSE41: # %bb.0: 454 ; SSE41-NEXT: xorps %xmm2, %xmm2 455 ; SSE41-NEXT: addss %xmm0, %xmm2 456 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 457 ; SSE41-NEXT: addss %xmm2, %xmm3 458 ; SSE41-NEXT: movaps %xmm0, %xmm2 459 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 460 ; SSE41-NEXT: addss %xmm3, %xmm2 461 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 462 ; SSE41-NEXT: addss %xmm2, %xmm0 463 ; SSE41-NEXT: addss %xmm1, %xmm0 464 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 465 ; SSE41-NEXT: addss %xmm2, %xmm0 466 ; SSE41-NEXT: movaps %xmm1, %xmm2 467 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 468 ; SSE41-NEXT: addss %xmm2, %xmm0 469 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 470 ; SSE41-NEXT: addss %xmm1, %xmm0 471 ; SSE41-NEXT: retq 472 ; 473 ; AVX-LABEL: test_v8f32_zero: 474 ; AVX: # %bb.0: 475 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 476 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 477 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 478 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 479 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 480 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 481 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 482 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 483 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 484 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 485 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 486 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 487 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 488 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 489 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 490 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 491 ; AVX-NEXT: vzeroupper 492 ; AVX-NEXT: retq 493 ; 494 ; AVX512-LABEL: test_v8f32_zero: 495 ; AVX512: # %bb.0: 496 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 497 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 498 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 499 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 500 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 501 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 502 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 503 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 504 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 505 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 506 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 507 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 508 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 509 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 510 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 511 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 512 ; AVX512-NEXT: vzeroupper 513 ; AVX512-NEXT: retq 514 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) 515 ret float %1 516 } 517 518 define float @test_v16f32_zero(<16 x float> %a0) { 519 ; SSE2-LABEL: test_v16f32_zero: 520 ; SSE2: # %bb.0: 521 ; SSE2-NEXT: xorps %xmm4, %xmm4 522 ; SSE2-NEXT: addss %xmm0, %xmm4 523 ; SSE2-NEXT: movaps %xmm0, %xmm5 524 ; SSE2-NEXT: 
shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3] 525 ; SSE2-NEXT: addss %xmm4, %xmm5 526 ; SSE2-NEXT: movaps %xmm0, %xmm4 527 ; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1] 528 ; SSE2-NEXT: addss %xmm5, %xmm4 529 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 530 ; SSE2-NEXT: addss %xmm4, %xmm0 531 ; SSE2-NEXT: addss %xmm1, %xmm0 532 ; SSE2-NEXT: movaps %xmm1, %xmm4 533 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3] 534 ; SSE2-NEXT: addss %xmm4, %xmm0 535 ; SSE2-NEXT: movaps %xmm1, %xmm4 536 ; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 537 ; SSE2-NEXT: addss %xmm4, %xmm0 538 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 539 ; SSE2-NEXT: addss %xmm1, %xmm0 540 ; SSE2-NEXT: addss %xmm2, %xmm0 541 ; SSE2-NEXT: movaps %xmm2, %xmm1 542 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 543 ; SSE2-NEXT: addss %xmm1, %xmm0 544 ; SSE2-NEXT: movaps %xmm2, %xmm1 545 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 546 ; SSE2-NEXT: addss %xmm1, %xmm0 547 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 548 ; SSE2-NEXT: addss %xmm2, %xmm0 549 ; SSE2-NEXT: addss %xmm3, %xmm0 550 ; SSE2-NEXT: movaps %xmm3, %xmm1 551 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 552 ; SSE2-NEXT: addss %xmm1, %xmm0 553 ; SSE2-NEXT: movaps %xmm3, %xmm1 554 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 555 ; SSE2-NEXT: addss %xmm1, %xmm0 556 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 557 ; SSE2-NEXT: addss %xmm3, %xmm0 558 ; SSE2-NEXT: retq 559 ; 560 ; SSE41-LABEL: test_v16f32_zero: 561 ; SSE41: # %bb.0: 562 ; SSE41-NEXT: xorps %xmm4, %xmm4 563 ; SSE41-NEXT: addss %xmm0, %xmm4 564 ; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] 565 ; SSE41-NEXT: addss %xmm4, %xmm5 566 ; SSE41-NEXT: movaps %xmm0, %xmm4 567 ; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1] 568 ; SSE41-NEXT: addss %xmm5, %xmm4 569 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 570 ; SSE41-NEXT: addss %xmm4, %xmm0 571 ; SSE41-NEXT: addss %xmm1, %xmm0 572 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 573 ; SSE41-NEXT: addss %xmm4, %xmm0 574 ; SSE41-NEXT: movaps %xmm1, %xmm4 575 ; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 576 ; SSE41-NEXT: addss %xmm4, %xmm0 577 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 578 ; SSE41-NEXT: addss %xmm1, %xmm0 579 ; SSE41-NEXT: addss %xmm2, %xmm0 580 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 581 ; SSE41-NEXT: addss %xmm1, %xmm0 582 ; SSE41-NEXT: movaps %xmm2, %xmm1 583 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 584 ; SSE41-NEXT: addss %xmm1, %xmm0 585 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 586 ; SSE41-NEXT: addss %xmm2, %xmm0 587 ; SSE41-NEXT: addss %xmm3, %xmm0 588 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 589 ; SSE41-NEXT: addss %xmm1, %xmm0 590 ; SSE41-NEXT: movaps %xmm3, %xmm1 591 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 592 ; SSE41-NEXT: addss %xmm1, %xmm0 593 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 594 ; SSE41-NEXT: addss %xmm3, %xmm0 595 ; SSE41-NEXT: retq 596 ; 597 ; AVX-LABEL: test_v16f32_zero: 598 ; AVX: # %bb.0: 599 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 600 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm2 601 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 602 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 603 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 604 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 605 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] 606 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 607 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 608 ; AVX-NEXT: vaddss %xmm0, 
%xmm2, %xmm2 609 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 610 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 611 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 612 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 613 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 614 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 615 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 616 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 617 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 618 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 619 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 620 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 621 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 622 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 623 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 624 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 625 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 626 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 627 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 628 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 629 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 630 ; AVX-NEXT: vzeroupper 631 ; AVX-NEXT: retq 632 ; 633 ; AVX512-LABEL: test_v16f32_zero: 634 ; AVX512: # %bb.0: 635 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 636 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 637 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 638 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 639 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 640 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 641 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 642 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 643 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 644 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 645 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 646 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 647 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 648 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 649 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 650 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 651 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 652 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 653 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 654 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 655 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 656 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 657 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 658 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 659 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 660 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 661 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 662 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 663 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 664 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 665 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 666 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 667 ; AVX512-NEXT: vzeroupper 668 ; AVX512-NEXT: retq 669 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) 670 ret float %1 671 } 672 673 ; 674 ; vXf32 (undef) 675 ; 676 677 define float @test_v2f32_undef(<2 x float> %a0) { 678 ; SSE2-LABEL: test_v2f32_undef: 679 ; SSE2: # %bb.0: 680 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] 681 ; SSE2-NEXT: addss {{.*}}(%rip), %xmm0 682 ; SSE2-NEXT: retq 683 ; 684 ; SSE41-LABEL: test_v2f32_undef: 685 ; SSE41: # %bb.0: 686 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 687 ; SSE41-NEXT: addss {{.*}}(%rip), %xmm0 688 ; SSE41-NEXT: retq 689 ; 690 ; AVX-LABEL: test_v2f32_undef: 691 ; AVX: # %bb.0: 692 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 693 ; AVX-NEXT: vaddss {{.*}}(%rip), 
%xmm0, %xmm0 694 ; AVX-NEXT: retq 695 ; 696 ; AVX512-LABEL: test_v2f32_undef: 697 ; AVX512: # %bb.0: 698 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 699 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 700 ; AVX512-NEXT: retq 701 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) 702 ret float %1 703 } 704 705 define float @test_v4f32_undef(<4 x float> %a0) { 706 ; SSE2-LABEL: test_v4f32_undef: 707 ; SSE2: # %bb.0: 708 ; SSE2-NEXT: movaps %xmm0, %xmm1 709 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] 710 ; SSE2-NEXT: addss {{.*}}(%rip), %xmm1 711 ; SSE2-NEXT: movaps %xmm0, %xmm2 712 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 713 ; SSE2-NEXT: addss %xmm1, %xmm2 714 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 715 ; SSE2-NEXT: addss %xmm2, %xmm0 716 ; SSE2-NEXT: retq 717 ; 718 ; SSE41-LABEL: test_v4f32_undef: 719 ; SSE41: # %bb.0: 720 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 721 ; SSE41-NEXT: addss {{.*}}(%rip), %xmm1 722 ; SSE41-NEXT: movaps %xmm0, %xmm2 723 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] 724 ; SSE41-NEXT: addss %xmm1, %xmm2 725 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 726 ; SSE41-NEXT: addss %xmm2, %xmm0 727 ; SSE41-NEXT: retq 728 ; 729 ; AVX-LABEL: test_v4f32_undef: 730 ; AVX: # %bb.0: 731 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 732 ; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 733 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 734 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 735 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 736 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 737 ; AVX-NEXT: retq 738 ; 739 ; AVX512-LABEL: test_v4f32_undef: 740 ; AVX512: # %bb.0: 741 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 742 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 743 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 744 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 745 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 746 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 747 ; AVX512-NEXT: retq 748 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) 749 ret float %1 750 } 751 752 define float @test_v8f32_undef(<8 x float> %a0) { 753 ; SSE2-LABEL: test_v8f32_undef: 754 ; SSE2: # %bb.0: 755 ; SSE2-NEXT: movaps %xmm0, %xmm2 756 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] 757 ; SSE2-NEXT: addss {{.*}}(%rip), %xmm2 758 ; SSE2-NEXT: movaps %xmm0, %xmm3 759 ; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] 760 ; SSE2-NEXT: addss %xmm2, %xmm3 761 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 762 ; SSE2-NEXT: addss %xmm3, %xmm0 763 ; SSE2-NEXT: addss %xmm1, %xmm0 764 ; SSE2-NEXT: movaps %xmm1, %xmm2 765 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] 766 ; SSE2-NEXT: addss %xmm2, %xmm0 767 ; SSE2-NEXT: movaps %xmm1, %xmm2 768 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 769 ; SSE2-NEXT: addss %xmm2, %xmm0 770 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 771 ; SSE2-NEXT: addss %xmm1, %xmm0 772 ; SSE2-NEXT: retq 773 ; 774 ; SSE41-LABEL: test_v8f32_undef: 775 ; SSE41: # %bb.0: 776 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 777 ; SSE41-NEXT: addss {{.*}}(%rip), %xmm2 778 ; SSE41-NEXT: movaps %xmm0, %xmm3 779 ; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] 780 ; SSE41-NEXT: addss %xmm2, %xmm3 781 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 782 ; SSE41-NEXT: addss %xmm3, %xmm0 783 ; SSE41-NEXT: addss %xmm1, %xmm0 784 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = 
xmm1[1,1,3,3] 785 ; SSE41-NEXT: addss %xmm2, %xmm0 786 ; SSE41-NEXT: movaps %xmm1, %xmm2 787 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] 788 ; SSE41-NEXT: addss %xmm2, %xmm0 789 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 790 ; SSE41-NEXT: addss %xmm1, %xmm0 791 ; SSE41-NEXT: retq 792 ; 793 ; AVX-LABEL: test_v8f32_undef: 794 ; AVX: # %bb.0: 795 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 796 ; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 797 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 798 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 799 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 800 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 801 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 802 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 803 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 804 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 805 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 806 ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 807 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 808 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 809 ; AVX-NEXT: vzeroupper 810 ; AVX-NEXT: retq 811 ; 812 ; AVX512-LABEL: test_v8f32_undef: 813 ; AVX512: # %bb.0: 814 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 815 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 816 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 817 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 818 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 819 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 820 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 821 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 822 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 823 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 824 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 825 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 826 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 827 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 828 ; AVX512-NEXT: vzeroupper 829 ; AVX512-NEXT: retq 830 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) 831 ret float %1 832 } 833 834 define float @test_v16f32_undef(<16 x float> %a0) { 835 ; SSE2-LABEL: test_v16f32_undef: 836 ; SSE2: # %bb.0: 837 ; SSE2-NEXT: movaps %xmm0, %xmm4 838 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3] 839 ; SSE2-NEXT: addss {{.*}}(%rip), %xmm4 840 ; SSE2-NEXT: movaps %xmm0, %xmm5 841 ; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] 842 ; SSE2-NEXT: addss %xmm4, %xmm5 843 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 844 ; SSE2-NEXT: addss %xmm5, %xmm0 845 ; SSE2-NEXT: addss %xmm1, %xmm0 846 ; SSE2-NEXT: movaps %xmm1, %xmm4 847 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3] 848 ; SSE2-NEXT: addss %xmm4, %xmm0 849 ; SSE2-NEXT: movaps %xmm1, %xmm4 850 ; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 851 ; SSE2-NEXT: addss %xmm4, %xmm0 852 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 853 ; SSE2-NEXT: addss %xmm1, %xmm0 854 ; SSE2-NEXT: addss %xmm2, %xmm0 855 ; SSE2-NEXT: movaps %xmm2, %xmm1 856 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] 857 ; SSE2-NEXT: addss %xmm1, %xmm0 858 ; SSE2-NEXT: movaps %xmm2, %xmm1 859 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 860 ; SSE2-NEXT: addss %xmm1, %xmm0 861 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 862 ; SSE2-NEXT: addss %xmm2, %xmm0 863 ; SSE2-NEXT: addss %xmm3, %xmm0 864 ; SSE2-NEXT: movaps %xmm3, %xmm1 865 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3] 866 ; SSE2-NEXT: addss %xmm1, %xmm0 867 ; SSE2-NEXT: movaps %xmm3, %xmm1 868 ; SSE2-NEXT: movhlps 
{{.*#+}} xmm1 = xmm3[1],xmm1[1] 869 ; SSE2-NEXT: addss %xmm1, %xmm0 870 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 871 ; SSE2-NEXT: addss %xmm3, %xmm0 872 ; SSE2-NEXT: retq 873 ; 874 ; SSE41-LABEL: test_v16f32_undef: 875 ; SSE41: # %bb.0: 876 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 877 ; SSE41-NEXT: addss {{.*}}(%rip), %xmm4 878 ; SSE41-NEXT: movaps %xmm0, %xmm5 879 ; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] 880 ; SSE41-NEXT: addss %xmm4, %xmm5 881 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] 882 ; SSE41-NEXT: addss %xmm5, %xmm0 883 ; SSE41-NEXT: addss %xmm1, %xmm0 884 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 885 ; SSE41-NEXT: addss %xmm4, %xmm0 886 ; SSE41-NEXT: movaps %xmm1, %xmm4 887 ; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] 888 ; SSE41-NEXT: addss %xmm4, %xmm0 889 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] 890 ; SSE41-NEXT: addss %xmm1, %xmm0 891 ; SSE41-NEXT: addss %xmm2, %xmm0 892 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 893 ; SSE41-NEXT: addss %xmm1, %xmm0 894 ; SSE41-NEXT: movaps %xmm2, %xmm1 895 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] 896 ; SSE41-NEXT: addss %xmm1, %xmm0 897 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] 898 ; SSE41-NEXT: addss %xmm2, %xmm0 899 ; SSE41-NEXT: addss %xmm3, %xmm0 900 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 901 ; SSE41-NEXT: addss %xmm1, %xmm0 902 ; SSE41-NEXT: movaps %xmm3, %xmm1 903 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 904 ; SSE41-NEXT: addss %xmm1, %xmm0 905 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] 906 ; SSE41-NEXT: addss %xmm3, %xmm0 907 ; SSE41-NEXT: retq 908 ; 909 ; AVX-LABEL: test_v16f32_undef: 910 ; AVX: # %bb.0: 911 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 912 ; AVX-NEXT: vaddss {{.*}}(%rip), %xmm2, %xmm2 913 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 914 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 915 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] 916 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 917 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 918 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 919 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 920 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 921 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 922 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 923 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 924 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 925 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 926 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 927 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 928 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 929 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 930 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 931 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 932 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 933 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 934 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 935 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 936 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 937 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 938 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 939 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 940 ; AVX-NEXT: vzeroupper 941 ; AVX-NEXT: retq 942 ; 943 ; AVX512-LABEL: test_v16f32_undef: 944 ; AVX512: # %bb.0: 945 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 946 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 947 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 948 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 949 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] 950 ; AVX512-NEXT: vaddss %xmm2, %xmm1, 
%xmm1 951 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 952 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 953 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 954 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 955 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 956 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 957 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 958 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 959 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 960 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 961 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 962 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 963 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 964 ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 965 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 966 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 967 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 968 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 969 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 970 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 971 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 972 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 973 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 974 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 975 ; AVX512-NEXT: vzeroupper 976 ; AVX512-NEXT: retq 977 %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) 978 ret float %1 979 } 980 981 ; 982 ; vXf64 (accum) 983 ; 984 985 define double @test_v2f64(double %a0, <2 x double> %a1) { 986 ; SSE-LABEL: test_v2f64: 987 ; SSE: # %bb.0: 988 ; SSE-NEXT: addsd %xmm1, %xmm0 989 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 990 ; SSE-NEXT: addsd %xmm1, %xmm0 991 ; SSE-NEXT: retq 992 ; 993 ; AVX-LABEL: test_v2f64: 994 ; AVX: # %bb.0: 995 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 996 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 997 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 998 ; AVX-NEXT: retq 999 ; 1000 ; AVX512-LABEL: test_v2f64: 1001 ; AVX512: # %bb.0: 1002 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1003 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1004 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1005 ; AVX512-NEXT: retq 1006 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) 1007 ret double %1 1008 } 1009 1010 define double @test_v4f64(double %a0, <4 x double> %a1) { 1011 ; SSE-LABEL: test_v4f64: 1012 ; SSE: # %bb.0: 1013 ; SSE-NEXT: addsd %xmm1, %xmm0 1014 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1015 ; SSE-NEXT: addsd %xmm1, %xmm0 1016 ; SSE-NEXT: addsd %xmm2, %xmm0 1017 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1018 ; SSE-NEXT: addsd %xmm2, %xmm0 1019 ; SSE-NEXT: retq 1020 ; 1021 ; AVX-LABEL: test_v4f64: 1022 ; AVX: # %bb.0: 1023 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1024 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1025 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1026 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1027 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1028 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1029 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1030 ; AVX-NEXT: vzeroupper 1031 ; AVX-NEXT: retq 1032 ; 1033 ; AVX512-LABEL: test_v4f64: 1034 ; AVX512: # %bb.0: 1035 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1036 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1037 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1038 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 1039 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1040 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1041 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1042 ; AVX512-NEXT: vzeroupper 1043 
; AVX512-NEXT: retq 1044 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) 1045 ret double %1 1046 } 1047 1048 define double @test_v8f64(double %a0, <8 x double> %a1) { 1049 ; SSE-LABEL: test_v8f64: 1050 ; SSE: # %bb.0: 1051 ; SSE-NEXT: addsd %xmm1, %xmm0 1052 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1053 ; SSE-NEXT: addsd %xmm1, %xmm0 1054 ; SSE-NEXT: addsd %xmm2, %xmm0 1055 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1056 ; SSE-NEXT: addsd %xmm2, %xmm0 1057 ; SSE-NEXT: addsd %xmm3, %xmm0 1058 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1059 ; SSE-NEXT: addsd %xmm3, %xmm0 1060 ; SSE-NEXT: addsd %xmm4, %xmm0 1061 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1062 ; SSE-NEXT: addsd %xmm4, %xmm0 1063 ; SSE-NEXT: retq 1064 ; 1065 ; AVX-LABEL: test_v8f64: 1066 ; AVX: # %bb.0: 1067 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1068 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1069 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1070 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1071 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1072 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1073 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1074 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1075 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1076 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1077 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1078 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1079 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1080 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1081 ; AVX-NEXT: vzeroupper 1082 ; AVX-NEXT: retq 1083 ; 1084 ; AVX512-LABEL: test_v8f64: 1085 ; AVX512: # %bb.0: 1086 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1087 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1088 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1089 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1090 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1091 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1092 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1093 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1094 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1095 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 1096 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1097 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1098 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1099 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1100 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1101 ; AVX512-NEXT: vzeroupper 1102 ; AVX512-NEXT: retq 1103 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) 1104 ret double %1 1105 } 1106 1107 define double @test_v16f64(double %a0, <16 x double> %a1) { 1108 ; SSE-LABEL: test_v16f64: 1109 ; SSE: # %bb.0: 1110 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1111 ; SSE-NEXT: addsd %xmm1, %xmm0 1112 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1113 ; SSE-NEXT: addsd %xmm1, %xmm0 1114 ; SSE-NEXT: addsd %xmm2, %xmm0 1115 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1116 ; SSE-NEXT: addsd %xmm2, %xmm0 1117 ; SSE-NEXT: addsd %xmm3, %xmm0 1118 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1119 ; SSE-NEXT: addsd %xmm3, %xmm0 1120 ; SSE-NEXT: addsd %xmm4, %xmm0 1121 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] 1122 ; SSE-NEXT: addsd %xmm4, %xmm0 1123 ; SSE-NEXT: addsd %xmm5, %xmm0 1124 ; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1] 1125 ; SSE-NEXT: addsd %xmm5, %xmm0 1126 ; SSE-NEXT: addsd %xmm6, %xmm0 1127 ; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1] 1128 ; SSE-NEXT: addsd %xmm6, %xmm0 1129 ; SSE-NEXT: addsd %xmm7, %xmm0 1130 ; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1] 1131 ; 
SSE-NEXT: addsd %xmm7, %xmm0 1132 ; SSE-NEXT: addsd %xmm8, %xmm0 1133 ; SSE-NEXT: movhlps {{.*#+}} xmm8 = xmm8[1,1] 1134 ; SSE-NEXT: addsd %xmm8, %xmm0 1135 ; SSE-NEXT: retq 1136 ; 1137 ; AVX-LABEL: test_v16f64: 1138 ; AVX: # %bb.0: 1139 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1140 ; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] 1141 ; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0 1142 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1143 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1144 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1145 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1146 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1147 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1148 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1149 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1150 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1151 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1152 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1153 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1154 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] 1155 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1156 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1157 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1158 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1159 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1160 ; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1161 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] 1162 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1163 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 1164 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1165 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1166 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1167 ; AVX-NEXT: vzeroupper 1168 ; AVX-NEXT: retq 1169 ; 1170 ; AVX512-LABEL: test_v16f64: 1171 ; AVX512: # %bb.0: 1172 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1173 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 1174 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1175 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 1176 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1177 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1178 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1179 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 1180 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1181 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] 1182 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1183 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1184 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1185 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1186 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1187 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1188 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1189 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1190 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 1191 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1192 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1193 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1194 ; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 1195 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1196 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1197 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1198 ; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 1199 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1200 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1201 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1202 ; AVX512-NEXT: vzeroupper 1203 ; AVX512-NEXT: retq 1204 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) 1205 ret double %1 1206 } 1207 1208 ; 1209 ; vXf64 (zero) 1210 ; 1211 1212 define double @test_v2f64_zero(<2 x double> %a0) { 1213 ; SSE-LABEL: test_v2f64_zero: 1214 ; SSE: # %bb.0: 1215 ; SSE-NEXT: xorpd %xmm1, %xmm1 1216 ; 
SSE-NEXT: addsd %xmm0, %xmm1 1217 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1218 ; SSE-NEXT: addsd %xmm1, %xmm0 1219 ; SSE-NEXT: retq 1220 ; 1221 ; AVX-LABEL: test_v2f64_zero: 1222 ; AVX: # %bb.0: 1223 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1224 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1225 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1226 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1227 ; AVX-NEXT: retq 1228 ; 1229 ; AVX512-LABEL: test_v2f64_zero: 1230 ; AVX512: # %bb.0: 1231 ; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1232 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1233 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1234 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1235 ; AVX512-NEXT: retq 1236 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) 1237 ret double %1 1238 } 1239 1240 define double @test_v4f64_zero(<4 x double> %a0) { 1241 ; SSE-LABEL: test_v4f64_zero: 1242 ; SSE: # %bb.0: 1243 ; SSE-NEXT: xorpd %xmm2, %xmm2 1244 ; SSE-NEXT: addsd %xmm0, %xmm2 1245 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1246 ; SSE-NEXT: addsd %xmm2, %xmm0 1247 ; SSE-NEXT: addsd %xmm1, %xmm0 1248 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1249 ; SSE-NEXT: addsd %xmm1, %xmm0 1250 ; SSE-NEXT: retq 1251 ; 1252 ; AVX-LABEL: test_v4f64_zero: 1253 ; AVX: # %bb.0: 1254 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1255 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1256 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1257 ; AVX-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1258 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1259 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1260 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1261 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1262 ; AVX-NEXT: vzeroupper 1263 ; AVX-NEXT: retq 1264 ; 1265 ; AVX512-LABEL: test_v4f64_zero: 1266 ; AVX512: # %bb.0: 1267 ; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1268 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1269 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] 1270 ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1271 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1272 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1273 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1274 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1275 ; AVX512-NEXT: vzeroupper 1276 ; AVX512-NEXT: retq 1277 %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) 1278 ret double %1 1279 } 1280 1281 define double @test_v8f64_zero(<8 x double> %a0) { 1282 ; SSE-LABEL: test_v8f64_zero: 1283 ; SSE: # %bb.0: 1284 ; SSE-NEXT: xorpd %xmm4, %xmm4 1285 ; SSE-NEXT: addsd %xmm0, %xmm4 1286 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 1287 ; SSE-NEXT: addsd %xmm4, %xmm0 1288 ; SSE-NEXT: addsd %xmm1, %xmm0 1289 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] 1290 ; SSE-NEXT: addsd %xmm1, %xmm0 1291 ; SSE-NEXT: addsd %xmm2, %xmm0 1292 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] 1293 ; SSE-NEXT: addsd %xmm2, %xmm0 1294 ; SSE-NEXT: addsd %xmm3, %xmm0 1295 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] 1296 ; SSE-NEXT: addsd %xmm3, %xmm0 1297 ; SSE-NEXT: retq 1298 ; 1299 ; AVX-LABEL: test_v8f64_zero: 1300 ; AVX: # %bb.0: 1301 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 1302 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm2 1303 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 1304 ; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm2 1305 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1306 ; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1307 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1308 ; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1309 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1310 ; AVX-NEXT: 
vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorpd %xmm8, %xmm8
; SSE-NEXT: addsd %xmm0, %xmm8
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm5, %xmm4, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)