; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Each test below builds a horizontal add/sub out of scalar extractelement,
; add/fadd (or sub/fsub), and insertelement sequences, and verifies that the
; x86 backend recognizes the pattern and selects the corresponding horizontal
; instruction (haddps/hsubps, haddpd/hsubpd, phaddd/phsubd, phaddw) when the
; target feature that provides it is available.

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

; Same as hadd_ps_test1, but the scalar adds/inserts appear in a different
; order; the pattern must still be matched to a single haddps.
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

; Integer horizontal add: phaddd requires SSSE3, so plain SSE3 falls back to
; a scalar extract/add/repack sequence.
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; 256-bit cases: on AVX the 4 x double horizontal add is done per 128-bit
; lane (extract/hadd/insert), while SSE targets split it into two haddpd.
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %edx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %r8d, %xmm3
; SSE3-NEXT:    movd %ebx, %xmm9
; SSE3-NEXT:    movd %r13d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm10
; SSE3-NEXT:    movd %r11d, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %ecx, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %ebp, %xmm5
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %r12d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm15
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 8
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
; Element 2 is B[1]-B[0] and element 3 is B[3]-B[2]: both pairs are subtracted
; in reversed operand order, so this must not be matched to a horizontal sub.
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  ; The B half uses reversed pair order: B[1]-B[0] and B[3]-B[2].
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

; Element 3 is B[3]-B[2] (reversed pair order), so hsubps must not be formed
; even though the A half and element 2 follow the horizontal-sub pattern.
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  ; Reversed pair: B[3]-B[2] instead of B[2]-B[3].
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

; Element 0 is A[1]-A[0] (reversed pair order), so hsubpd must not be formed.
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  ; Reversed pair: A[1]-A[0] instead of A[0]-A[1].
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.

; Insert indices follow the vhaddps ymm per-128-bit-lane layout:
; elements 0-3 come from the low halves of %a and %b, 4-7 from the high halves.
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

; Same per-lane layout as avx_vhadd_ps, with fsub: should select vhsubps ymm.
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

; vhaddpd ymm layout: element 0 = a[0]+a[1], 1 = b[0]+b[1],
; 2 = a[2]+a[3], 3 = b[2]+b[3].
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

; Same layout as avx_hadd_pd, with fsub: should select vhsubpd ymm.
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
; vphaddd ymm operates per 128-bit lane: elements 0-1 are pair sums of the low
; half of %a, 2-3 of the low half of %b, 4-5 of the high half of %a, 6-7 of the
; high half of %b — exactly the insert order used below.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

; vphaddw ymm operates per 128-bit lane: the insert indices place %a's low-half
; pair sums in elements 0-3, %b's low-half sums in 4-7, %a's high-half sums in
; 8-11, and %b's high-half sums in 12-15.
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %esi
; SSE3-NEXT:    pextrw $7, %xmm1, %r15d
; SSE3-NEXT:    addl %esi, %r15d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %esi, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %esi
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %ecx
; SSE3-NEXT:    addl %esi, %ecx
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %ebx, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm3, %r8d
; SSE3-NEXT:    pextrw $7, %xmm3, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    movd %ecx, %xmm8
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movd %edi, %xmm9
; SSE3-NEXT:    movd %ebp, %xmm4
; SSE3-NEXT:    movd %r13d, %xmm10
; SSE3-NEXT:    movd %r12d, %xmm7
; SSE3-NEXT:    movd %r11d, %xmm11
; SSE3-NEXT:    movd %r10d, %xmm0
; SSE3-NEXT:    movd %edx, %xmm12
; SSE3-NEXT:    movd %esi, %xmm6
; SSE3-NEXT:    movd %ebx, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm5
; SSE3-NEXT:    movd %r15d, %xmm14
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 8
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  ; High half of %a lands in result elements 8-11 (second 128-bit lane).
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  ; Low half of %b lands in result elements 4-7 (first 128-bit lane).
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  ; High half of %b lands in result elements 12-15 (second 128-bit lane).
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}