; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

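; This file checks that chains of extractelement/binop/insertelement are
; recognized and matched to the x86 horizontal add/sub instructions.
; As a worked example, haddps %xmm1, %xmm0 computes
;   xmm0 = [ x0+x1, x2+x3, y0+y1, y2+y3 ]   (x = xmm0, y = xmm1)
; which is exactly the value built up one scalar lane at a time by the IR
; in hadd_ps_test1 below.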
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

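; The hsub patterns below have the same shape, but since subtraction is not
; commutative each pair must compute (even lane) - (odd lane) for the match
; to succeed; the not_a_hsub tests near the end of the file check the
; reversed-order cases.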
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

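; Integer horizontal adds: phaddd was introduced in SSSE3, so the SSE3-only
; run cannot select it and is expected to expand to scalar
; movd/pshufd/addl sequences instead.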
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %esi, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # BB#0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    movd %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # BB#0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

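; <2 x double> variants: haddpd %xmm1, %xmm0 computes [ x0+x1, y0+y1 ].
; hadd_pd_test2 adds each pair in reversed order; fadd is commutative, so
; the same haddpd is still expected.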
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

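; avx_vhadd_pd_test asks for [ a0+a1, a2+a3, b0+b1, b2+b3 ], but vhaddpd on
; a ymm register works within each 128-bit lane and would instead give
; [ a0+a1, b0+b1, a2+a3, b2+b3 ]. The AVX code is therefore expected to
; split the operands with vextractf128, use two 128-bit vhaddpd, and
; recombine with vinsertf128.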
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # BB#0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # BB#0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

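; avx2_vphadd_d_test wants all four sums from %A in the low 128 bits and
; all four sums from %B in the high 128 bits. Since vphaddd on ymm
; registers also operates per 128-bit lane, even the AVX2 run is expected
; to use two 128-bit vphaddd plus vextracti128/vinserti128.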
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %esi, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %r9d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %edx, %xmm3
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

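; Same layout as above with sixteen i16 elements. The SSE3 run has no
; phaddw (another SSSE3 instruction), so it expands to scalar pextrw/addl
; sequences; the expansion needs all six callee-saved GPRs, hence the
; push/pop and .cfi bookkeeping below.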
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # BB#0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:  .Ltmp0:
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:  .Ltmp1:
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:  .Ltmp2:
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:  .Ltmp3:
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:  .Ltmp4:
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:  .Ltmp5:
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:  .Ltmp6:
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:  .Ltmp7:
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:  .Ltmp8:
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:  .Ltmp9:
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:  .Ltmp10:
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:  .Ltmp11:
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %eax
; SSE3-NEXT:    pextrw $3, %xmm3, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pextrw $4, %xmm3, %eax
; SSE3-NEXT:    pextrw $5, %xmm3, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    pextrw $6, %xmm3, %ecx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movd %edx, %xmm8
; SSE3-NEXT:    movd %r13d, %xmm3
; SSE3-NEXT:    movd %ebp, %xmm9
; SSE3-NEXT:    movd %r11d, %xmm4
; SSE3-NEXT:    movd %ebx, %xmm10
; SSE3-NEXT:    movd %r10d, %xmm7
; SSE3-NEXT:    movd %r14d, %xmm11
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %r8d, %xmm6
; SSE3-NEXT:    movd %esi, %xmm13
; SSE3-NEXT:    movd %r12d, %xmm5
; SSE3-NEXT:    movd %edi, %xmm14
; SSE3-NEXT:    movd %r15d, %xmm2
; SSE3-NEXT:    movd %r9d, %xmm15
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
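; In each of these, at least one pair subtracts its elements in reversed
; order (e.g. B[1]-B[0] instead of B[0]-B[1]). Subtraction is not
; commutative, so the pattern no longer matches hsubps/hsubpd/phsubd and
; the scalar sequence must be kept.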

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # BB#0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # BB#0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    subss %xmm4, %xmm3
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # BB#0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
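; Unlike avx_vhadd_pd_test above, these patterns request exactly the
; per-128-bit-lane element order produced by the 256-bit vhaddps/vhsubps
; and vhaddpd/vhsubpd instructions, so each function should match a single
; ymm instruction.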

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
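; Here the requested element order again matches the per-lane behavior of
; vphaddd, so AVX2 can use a single 256-bit vphaddd. AVX1 provides no
; 256-bit integer instructions and must split into two 128-bit phaddd via
; vextractf128/vinsertf128; the SSE3 run falls back to scalar code as
; before.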
   1152 
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %esi, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %r9d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %edx, %xmm3
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # BB#0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

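; The i16 version of the test above. The pair sums of %a land in elements
; 0-3 and 8-11 and those of %b in elements 4-7 and 12-15, matching the
; lane-wise 256-bit VPHADDW. With only SSE3 (no PHADDW), the 16 scalar adds
; need so many GPRs that codegen saves all six callee-saved registers and
; spills two of the sums to the stack.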
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # BB#0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:  .Ltmp12:
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:  .Ltmp13:
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:  .Ltmp14:
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:  .Ltmp15:
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:  .Ltmp16:
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:  .Ltmp17:
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:  .Ltmp18:
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:  .Ltmp19:
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:  .Ltmp20:
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:  .Ltmp21:
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:  .Ltmp22:
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:  .Ltmp23:
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %r8d
; SSE3-NEXT:    addl %esi, %r8d
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %edx
; SSE3-NEXT:    addl %esi, %edx
; SSE3-NEXT:    movd %xmm3, %edi
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %edi, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %ebp
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %ebp, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %eax
; SSE3-NEXT:    pextrw $5, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $6, %xmm3, %esi
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    movd %edx, %xmm8
; SSE3-NEXT:    movd %r13d, %xmm3
; SSE3-NEXT:    movd %ecx, %xmm9
; SSE3-NEXT:    movd %r15d, %xmm4
; SSE3-NEXT:    movd %r8d, %xmm10
; SSE3-NEXT:    movd %r14d, %xmm7
; SSE3-NEXT:    movd %ebx, %xmm11
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %r12d, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %r11d, %xmm5
; SSE3-NEXT:    movd %ebp, %xmm14
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r9d, %xmm15
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # BB#0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}
   1470