; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
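
; Verify that the backend folds chains of scalar extractelement, add/sub, and
; insertelement operations into the horizontal add/sub instructions provided
; by SSE3/SSSE3/AVX (haddps, hsubpd, phaddd, phaddw, etc.). Each test builds
; the horizontal result one lane at a time.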

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE: # BB#0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX: # BB#0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE: # BB#0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX: # BB#0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %esi, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test1:
; AVX: # BB#0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3: # BB#0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movd %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %ecx, %xmm1
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test2:
; AVX: # BB#0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: subl %edi, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movd %ecx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3: # BB#0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test1:
; AVX: # BB#0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3: # BB#0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3: # BB#0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test2:
; AVX: # BB#0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX: # BB#0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX: # BB#0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX: # BB#0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX: # BB#0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}
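
; The v4f64 tests below place both sums of %A in the low 128 bits of the
; result, which does not match the per-128-bit-lane output of the 256-bit
; vhaddpd/vhsubpd, so on AVX they legalize as two 128-bit horizontal ops plus
; a vinsertf128.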

define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}
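
; This test wants all four sums of %A in the low half of the result, which
; the lane-wise 256-bit vphaddd cannot produce directly, so even AVX2 splits
; it into two 128-bit vphaddd operations.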

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %esi, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %esi, %edi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %r9d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r10d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %edx, %xmm3
; SSE3-NEXT: movd %r11d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}
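
; Same as above with sixteen i16 elements: the requested element order forces
; two 128-bit vphaddw operations rather than a single 256-bit one.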

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3: # BB#0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .Ltmp0:
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .Ltmp1:
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .Ltmp2:
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .Ltmp3:
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .Ltmp4:
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .Ltmp5:
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .Ltmp6:
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .Ltmp7:
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .Ltmp8:
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .Ltmp9:
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .Ltmp10:
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .Ltmp11:
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ebp
; SSE3-NEXT: addl %eax, %ebp
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %ebx
; SSE3-NEXT: addl %eax, %ebx
; SSE3-NEXT: pextrw $6, %xmm1, %eax
; SSE3-NEXT: pextrw $7, %xmm1, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pextrw $1, %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm2, %eax
; SSE3-NEXT: pextrw $3, %xmm2, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $4, %xmm2, %eax
; SSE3-NEXT: pextrw $5, %xmm2, %r15d
; SSE3-NEXT: addl %eax, %r15d
; SSE3-NEXT: pextrw $6, %xmm2, %eax
; SSE3-NEXT: pextrw $7, %xmm2, %r8d
; SSE3-NEXT: addl %eax, %r8d
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
; SSE3-NEXT: addl %eax, %r9d
; SSE3-NEXT: pextrw $2, %xmm3, %eax
; SSE3-NEXT: pextrw $3, %xmm3, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pextrw $4, %xmm3, %eax
; SSE3-NEXT: pextrw $5, %xmm3, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: pextrw $6, %xmm3, %ecx
; SSE3-NEXT: pextrw $7, %xmm3, %eax
; SSE3-NEXT: addl %ecx, %eax
; SSE3-NEXT: movd %edx, %xmm8
; SSE3-NEXT: movd %r13d, %xmm3
; SSE3-NEXT: movd %ebp, %xmm9
; SSE3-NEXT: movd %r11d, %xmm4
; SSE3-NEXT: movd %ebx, %xmm10
; SSE3-NEXT: movd %r10d, %xmm7
; SSE3-NEXT: movd %r14d, %xmm11
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movd %r8d, %xmm6
; SSE3-NEXT: movd %esi, %xmm13
; SSE3-NEXT: movd %r12d, %xmm5
; SSE3-NEXT: movd %edi, %xmm14
; SSE3-NEXT: movd %r15d, %xmm2
; SSE3-NEXT: movd %r9d, %xmm15
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
; SSE3-NEXT: popq %r14
; SSE3-NEXT: popq %r15
; SSE3-NEXT: popq %rbp
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
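; In each case at least one pair is subtracted in the reversed order, so the
; pattern does not match hsubps/hsubpd/phsubd.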

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: movd %xmm1, %esi
; SSE-NEXT: subl %esi, %edx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT: movd %xmm0, %esi
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm0, %edi
; SSE-NEXT: subl %edi, %esi
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: movd %ecx, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movd %edx, %xmm2
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX: # BB#0:
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vpextrd $1, %xmm0, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpextrd $2, %xmm0, %ecx
; AVX-NEXT: vpextrd $3, %xmm0, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vpextrd $1, %xmm1, %edx
; AVX-NEXT: vmovd %xmm1, %esi
; AVX-NEXT: subl %esi, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vpextrd $2, %xmm1, %edi
; AVX-NEXT: subl %edi, %esi
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE: # BB#0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT: movapd %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT: subss %xmm4, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE: # BB#0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT: subsd %xmm0, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX: # BB#0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
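; Here the requested lane order matches the lane-interleaved semantics of the
; 256-bit instructions, so each function folds to a single vhaddps, vhsubps,
; vhaddpd, or vhsubpd.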

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE: # BB#0:
; SSE-NEXT: hsubps %xmm2, %xmm0
; SSE-NEXT: hsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX: # BB#0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm2, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX: # BB#0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm2, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX: # BB#0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
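; Unlike avx2_vphadd_d_test and avx2_vphadd_w_test above, these tests request
; the lane-interleaved order that the 256-bit vphaddd/vphaddw produce, so
; AVX2 emits a single ymm instruction.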

define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm2, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %esi, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %esi, %edi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %r9d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r10d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %edx, %xmm3
; SSE3-NEXT: movd %r11d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm2, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2: # BB#0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}
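
; Note the insert indices below: sums of %a go to elements 0-3 and 8-11 and
; sums of %b to elements 4-7 and 12-15, exactly the output order of a 256-bit
; vphaddw.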

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3: # BB#0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .Ltmp12:
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .Ltmp13:
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .Ltmp14:
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .Ltmp15:
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .Ltmp16:
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .Ltmp17:
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .Ltmp18:
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .Ltmp19:
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .Ltmp20:
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .Ltmp21:
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .Ltmp22:
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .Ltmp23:
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %r15d
; SSE3-NEXT: addl %eax, %r15d
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $6, %xmm1, %eax
; SSE3-NEXT: pextrw $7, %xmm1, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pextrw $1, %xmm2, %ebx
; SSE3-NEXT: addl %eax, %ebx
; SSE3-NEXT: pextrw $2, %xmm2, %eax
; SSE3-NEXT: pextrw $3, %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: pextrw $4, %xmm2, %esi
; SSE3-NEXT: pextrw $5, %xmm2, %r8d
; SSE3-NEXT: addl %esi, %r8d
; SSE3-NEXT: pextrw $6, %xmm2, %esi
; SSE3-NEXT: pextrw $7, %xmm2, %edx
; SSE3-NEXT: addl %esi, %edx
; SSE3-NEXT: movd %xmm3, %edi
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
; SSE3-NEXT: addl %edi, %r9d
; SSE3-NEXT: pextrw $2, %xmm3, %ebp
; SSE3-NEXT: pextrw $3, %xmm3, %edi
; SSE3-NEXT: addl %ebp, %edi
; SSE3-NEXT: pextrw $4, %xmm3, %eax
; SSE3-NEXT: pextrw $5, %xmm3, %ebp
; SSE3-NEXT: addl %eax, %ebp
; SSE3-NEXT: pextrw $6, %xmm3, %esi
; SSE3-NEXT: pextrw $7, %xmm3, %eax
; SSE3-NEXT: addl %esi, %eax
; SSE3-NEXT: movd %edx, %xmm8
; SSE3-NEXT: movd %r13d, %xmm3
; SSE3-NEXT: movd %ecx, %xmm9
; SSE3-NEXT: movd %r15d, %xmm4
; SSE3-NEXT: movd %r8d, %xmm10
; SSE3-NEXT: movd %r14d, %xmm7
; SSE3-NEXT: movd %ebx, %xmm11
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movd %r12d, %xmm6
; SSE3-NEXT: movd %edi, %xmm13
; SSE3-NEXT: movd %r11d, %xmm5
; SSE3-NEXT: movd %ebp, %xmm14
; SSE3-NEXT: movd %r10d, %xmm2
; SSE3-NEXT: movd %r9d, %xmm15
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
; SSE3-NEXT: popq %r14
; SSE3-NEXT: popq %r15
; SSE3-NEXT: popq %rbp
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddw %xmm2, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2: # BB#0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}