; X86 horizontal add/sub codegen regression test (haddps/hsubps/phaddd/phsubd/haddpd/hsubpd and 256-bit variants).
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
      3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      6 
      7 define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
      8 ; SSE-LABEL: hadd_ps_test1:
      9 ; SSE:       # %bb.0:
     10 ; SSE-NEXT:    haddps %xmm1, %xmm0
     11 ; SSE-NEXT:    retq
     12 ;
     13 ; AVX-LABEL: hadd_ps_test1:
     14 ; AVX:       # %bb.0:
     15 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
     16 ; AVX-NEXT:    retq
        ; Canonical pattern - lanes 0/1 hold the pairwise sums of A, lanes 2/3 the
        ; pairwise sums of B, inserted in ascending lane order. This is exactly the
        ; operand layout of (v)haddps, so it must fold to a single instruction.
     17   %vecext = extractelement <4 x float> %A, i32 0
     18   %vecext1 = extractelement <4 x float> %A, i32 1
     19   %add = fadd float %vecext, %vecext1
     20   %vecinit = insertelement <4 x float> undef, float %add, i32 0
     21   %vecext2 = extractelement <4 x float> %A, i32 2
     22   %vecext3 = extractelement <4 x float> %A, i32 3
     23   %add4 = fadd float %vecext2, %vecext3
     24   %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
     25   %vecext6 = extractelement <4 x float> %B, i32 0
     26   %vecext7 = extractelement <4 x float> %B, i32 1
     27   %add8 = fadd float %vecext6, %vecext7
     28   %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
     29   %vecext10 = extractelement <4 x float> %B, i32 2
     30   %vecext11 = extractelement <4 x float> %B, i32 3
     31   %add12 = fadd float %vecext10, %vecext11
     32   %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
     33   ret <4 x float> %vecinit13
     34 }
     35 
     36 define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
     37 ; SSE-LABEL: hadd_ps_test2:
     38 ; SSE:       # %bb.0:
     39 ; SSE-NEXT:    haddps %xmm1, %xmm0
     40 ; SSE-NEXT:    retq
     41 ;
     42 ; AVX-LABEL: hadd_ps_test2:
     43 ; AVX:       # %bb.0:
     44 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
     45 ; AVX-NEXT:    retq
        ; Same result vector as hadd_ps_test1, but the inserts are performed out of
        ; lane order (1,0,3,2). The pattern matcher must recognize the hadd
        ; regardless of the order in which the lanes are built.
     46   %vecext = extractelement <4 x float> %A, i32 2
     47   %vecext1 = extractelement <4 x float> %A, i32 3
     48   %add = fadd float %vecext, %vecext1
     49   %vecinit = insertelement <4 x float> undef, float %add, i32 1
     50   %vecext2 = extractelement <4 x float> %A, i32 0
     51   %vecext3 = extractelement <4 x float> %A, i32 1
     52   %add4 = fadd float %vecext2, %vecext3
     53   %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
     54   %vecext6 = extractelement <4 x float> %B, i32 2
     55   %vecext7 = extractelement <4 x float> %B, i32 3
     56   %add8 = fadd float %vecext6, %vecext7
     57   %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
     58   %vecext10 = extractelement <4 x float> %B, i32 0
     59   %vecext11 = extractelement <4 x float> %B, i32 1
     60   %add12 = fadd float %vecext10, %vecext11
     61   %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
     62   ret <4 x float> %vecinit13
     63 }
     64 
     65 define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
     66 ; SSE-LABEL: hsub_ps_test1:
     67 ; SSE:       # %bb.0:
     68 ; SSE-NEXT:    hsubps %xmm1, %xmm0
     69 ; SSE-NEXT:    retq
     70 ;
     71 ; AVX-LABEL: hsub_ps_test1:
     72 ; AVX:       # %bb.0:
     73 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
     74 ; AVX-NEXT:    retq
        ; Pairwise differences (even lane minus odd lane) of A then B, inserted in
        ; ascending lane order. fsub is non-commutative, so operand order within
        ; each pair must match the (v)hsubps semantics exactly.
     75   %vecext = extractelement <4 x float> %A, i32 0
     76   %vecext1 = extractelement <4 x float> %A, i32 1
     77   %sub = fsub float %vecext, %vecext1
     78   %vecinit = insertelement <4 x float> undef, float %sub, i32 0
     79   %vecext2 = extractelement <4 x float> %A, i32 2
     80   %vecext3 = extractelement <4 x float> %A, i32 3
     81   %sub4 = fsub float %vecext2, %vecext3
     82   %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
     83   %vecext6 = extractelement <4 x float> %B, i32 0
     84   %vecext7 = extractelement <4 x float> %B, i32 1
     85   %sub8 = fsub float %vecext6, %vecext7
     86   %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
     87   %vecext10 = extractelement <4 x float> %B, i32 2
     88   %vecext11 = extractelement <4 x float> %B, i32 3
     89   %sub12 = fsub float %vecext10, %vecext11
     90   %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
     91   ret <4 x float> %vecinit13
     92 }
     93 
     94 define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
     95 ; SSE-LABEL: hsub_ps_test2:
     96 ; SSE:       # %bb.0:
     97 ; SSE-NEXT:    hsubps %xmm1, %xmm0
     98 ; SSE-NEXT:    retq
     99 ;
    100 ; AVX-LABEL: hsub_ps_test2:
    101 ; AVX:       # %bb.0:
    102 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
    103 ; AVX-NEXT:    retq
        ; Same hsub result as hsub_ps_test1, but built with out-of-order inserts
        ; (lanes 1,0,3,2) to check the matcher is insert-order independent.
    104   %vecext = extractelement <4 x float> %A, i32 2
    105   %vecext1 = extractelement <4 x float> %A, i32 3
    106   %sub = fsub float %vecext, %vecext1
    107   %vecinit = insertelement <4 x float> undef, float %sub, i32 1
    108   %vecext2 = extractelement <4 x float> %A, i32 0
    109   %vecext3 = extractelement <4 x float> %A, i32 1
    110   %sub4 = fsub float %vecext2, %vecext3
    111   %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
    112   %vecext6 = extractelement <4 x float> %B, i32 2
    113   %vecext7 = extractelement <4 x float> %B, i32 3
    114   %sub8 = fsub float %vecext6, %vecext7
    115   %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
    116   %vecext10 = extractelement <4 x float> %B, i32 0
    117   %vecext11 = extractelement <4 x float> %B, i32 1
    118   %sub12 = fsub float %vecext10, %vecext11
    119   %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
    120   ret <4 x float> %vecinit13
    121 }
    122 
    123 define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
    124 ; SSE3-LABEL: phadd_d_test1:
    125 ; SSE3:       # %bb.0:
    126 ; SSE3-NEXT:    movd %xmm0, %eax
    127 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
    128 ; SSE3-NEXT:    movd %xmm2, %ecx
    129 ; SSE3-NEXT:    addl %eax, %ecx
    130 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    131 ; SSE3-NEXT:    movd %xmm2, %eax
    132 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    133 ; SSE3-NEXT:    movd %xmm0, %edx
    134 ; SSE3-NEXT:    addl %eax, %edx
    135 ; SSE3-NEXT:    movd %xmm1, %eax
    136 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    137 ; SSE3-NEXT:    movd %xmm0, %esi
    138 ; SSE3-NEXT:    addl %eax, %esi
    139 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    140 ; SSE3-NEXT:    movd %xmm0, %eax
    141 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    142 ; SSE3-NEXT:    movd %xmm0, %edi
    143 ; SSE3-NEXT:    addl %eax, %edi
    144 ; SSE3-NEXT:    movd %edi, %xmm0
    145 ; SSE3-NEXT:    movd %esi, %xmm1
    146 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    147 ; SSE3-NEXT:    movd %edx, %xmm2
    148 ; SSE3-NEXT:    movd %ecx, %xmm0
    149 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    150 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    151 ; SSE3-NEXT:    retq
    152 ;
    153 ; SSSE3-LABEL: phadd_d_test1:
    154 ; SSSE3:       # %bb.0:
    155 ; SSSE3-NEXT:    phaddd %xmm1, %xmm0
    156 ; SSSE3-NEXT:    retq
    157 ;
    158 ; AVX-LABEL: phadd_d_test1:
    159 ; AVX:       # %bb.0:
    160 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
    161 ; AVX-NEXT:    retq
        ; Integer pairwise adds in canonical lane order. phaddd is an SSSE3
        ; instruction, so the plain-SSE3 run must fall back to scalar adds while
        ; SSSE3/AVX fold the whole chain into one (v)phaddd.
    162   %vecext = extractelement <4 x i32> %A, i32 0
    163   %vecext1 = extractelement <4 x i32> %A, i32 1
    164   %add = add i32 %vecext, %vecext1
    165   %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
    166   %vecext2 = extractelement <4 x i32> %A, i32 2
    167   %vecext3 = extractelement <4 x i32> %A, i32 3
    168   %add4 = add i32 %vecext2, %vecext3
    169   %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
    170   %vecext6 = extractelement <4 x i32> %B, i32 0
    171   %vecext7 = extractelement <4 x i32> %B, i32 1
    172   %add8 = add i32 %vecext6, %vecext7
    173   %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
    174   %vecext10 = extractelement <4 x i32> %B, i32 2
    175   %vecext11 = extractelement <4 x i32> %B, i32 3
    176   %add12 = add i32 %vecext10, %vecext11
    177   %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
    178   ret <4 x i32> %vecinit13
    179 }
    180 
    181 define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
    182 ; SSE3-LABEL: phadd_d_test2:
    183 ; SSE3:       # %bb.0:
    184 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    185 ; SSE3-NEXT:    movd %xmm2, %eax
    186 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
    187 ; SSE3-NEXT:    movd %xmm2, %ecx
    188 ; SSE3-NEXT:    addl %eax, %ecx
    189 ; SSE3-NEXT:    movd %xmm0, %eax
    190 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
    191 ; SSE3-NEXT:    movd %xmm0, %edx
    192 ; SSE3-NEXT:    addl %eax, %edx
    193 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    194 ; SSE3-NEXT:    movd %xmm0, %eax
    195 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    196 ; SSE3-NEXT:    movd %xmm0, %esi
    197 ; SSE3-NEXT:    addl %eax, %esi
    198 ; SSE3-NEXT:    movd %esi, %xmm0
    199 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
    200 ; SSE3-NEXT:    movd %xmm2, %eax
    201 ; SSE3-NEXT:    movd %xmm1, %esi
    202 ; SSE3-NEXT:    addl %eax, %esi
    203 ; SSE3-NEXT:    movd %esi, %xmm1
    204 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    205 ; SSE3-NEXT:    movd %ecx, %xmm2
    206 ; SSE3-NEXT:    movd %edx, %xmm0
    207 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    208 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    209 ; SSE3-NEXT:    retq
    210 ;
    211 ; SSSE3-LABEL: phadd_d_test2:
    212 ; SSSE3:       # %bb.0:
    213 ; SSSE3-NEXT:    phaddd %xmm1, %xmm0
    214 ; SSSE3-NEXT:    retq
    215 ;
    216 ; AVX-LABEL: phadd_d_test2:
    217 ; AVX:       # %bb.0:
    218 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
    219 ; AVX-NEXT:    retq
        ; Out-of-order inserts AND commuted add operands for the B pairs (3+2, 1+0).
        ; Integer add is commutative, so SSSE3/AVX must still match (v)phaddd.
    220   %vecext = extractelement <4 x i32> %A, i32 2
    221   %vecext1 = extractelement <4 x i32> %A, i32 3
    222   %add = add i32 %vecext, %vecext1
    223   %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
    224   %vecext2 = extractelement <4 x i32> %A, i32 0
    225   %vecext3 = extractelement <4 x i32> %A, i32 1
    226   %add4 = add i32 %vecext2, %vecext3
    227   %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
    228   %vecext6 = extractelement <4 x i32> %B, i32 3
    229   %vecext7 = extractelement <4 x i32> %B, i32 2
    230   %add8 = add i32 %vecext6, %vecext7
    231   %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
    232   %vecext10 = extractelement <4 x i32> %B, i32 1
    233   %vecext11 = extractelement <4 x i32> %B, i32 0
    234   %add12 = add i32 %vecext10, %vecext11
    235   %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
    236   ret <4 x i32> %vecinit13
    237 }
    238 
    239 define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
    240 ; SSE3-LABEL: phsub_d_test1:
    241 ; SSE3:       # %bb.0:
    242 ; SSE3-NEXT:    movd %xmm0, %eax
    243 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
    244 ; SSE3-NEXT:    movd %xmm2, %ecx
    245 ; SSE3-NEXT:    subl %ecx, %eax
    246 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    247 ; SSE3-NEXT:    movd %xmm2, %ecx
    248 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    249 ; SSE3-NEXT:    movd %xmm0, %edx
    250 ; SSE3-NEXT:    subl %edx, %ecx
    251 ; SSE3-NEXT:    movd %xmm1, %edx
    252 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    253 ; SSE3-NEXT:    movd %xmm0, %esi
    254 ; SSE3-NEXT:    subl %esi, %edx
    255 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    256 ; SSE3-NEXT:    movd %xmm0, %esi
    257 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    258 ; SSE3-NEXT:    movd %xmm0, %edi
    259 ; SSE3-NEXT:    subl %edi, %esi
    260 ; SSE3-NEXT:    movd %esi, %xmm0
    261 ; SSE3-NEXT:    movd %edx, %xmm1
    262 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    263 ; SSE3-NEXT:    movd %ecx, %xmm2
    264 ; SSE3-NEXT:    movd %eax, %xmm0
    265 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    266 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    267 ; SSE3-NEXT:    retq
    268 ;
    269 ; SSSE3-LABEL: phsub_d_test1:
    270 ; SSSE3:       # %bb.0:
    271 ; SSSE3-NEXT:    phsubd %xmm1, %xmm0
    272 ; SSSE3-NEXT:    retq
    273 ;
    274 ; AVX-LABEL: phsub_d_test1:
    275 ; AVX:       # %bb.0:
    276 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
    277 ; AVX-NEXT:    retq
        ; Integer pairwise differences (even lane minus odd lane) in canonical lane
        ; order. sub is non-commutative, so operand order must match (v)phsubd;
        ; SSSE3/AVX fold to one instruction, plain SSE3 stays scalar.
    278   %vecext = extractelement <4 x i32> %A, i32 0
    279   %vecext1 = extractelement <4 x i32> %A, i32 1
    280   %sub = sub i32 %vecext, %vecext1
    281   %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
    282   %vecext2 = extractelement <4 x i32> %A, i32 2
    283   %vecext3 = extractelement <4 x i32> %A, i32 3
    284   %sub4 = sub i32 %vecext2, %vecext3
    285   %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
    286   %vecext6 = extractelement <4 x i32> %B, i32 0
    287   %vecext7 = extractelement <4 x i32> %B, i32 1
    288   %sub8 = sub i32 %vecext6, %vecext7
    289   %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
    290   %vecext10 = extractelement <4 x i32> %B, i32 2
    291   %vecext11 = extractelement <4 x i32> %B, i32 3
    292   %sub12 = sub i32 %vecext10, %vecext11
    293   %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
    294   ret <4 x i32> %vecinit13
    295 }
    296 
    297 define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
    298 ; SSE3-LABEL: phsub_d_test2:
    299 ; SSE3:       # %bb.0:
    300 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    301 ; SSE3-NEXT:    movd %xmm2, %eax
    302 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
    303 ; SSE3-NEXT:    movd %xmm2, %ecx
    304 ; SSE3-NEXT:    subl %ecx, %eax
    305 ; SSE3-NEXT:    movd %xmm0, %ecx
    306 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
    307 ; SSE3-NEXT:    movd %xmm0, %edx
    308 ; SSE3-NEXT:    subl %edx, %ecx
    309 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    310 ; SSE3-NEXT:    movd %xmm0, %edx
    311 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    312 ; SSE3-NEXT:    movd %xmm0, %esi
    313 ; SSE3-NEXT:    subl %esi, %edx
    314 ; SSE3-NEXT:    movd %edx, %xmm0
    315 ; SSE3-NEXT:    movd %xmm1, %edx
    316 ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
    317 ; SSE3-NEXT:    movd %xmm1, %esi
    318 ; SSE3-NEXT:    subl %esi, %edx
    319 ; SSE3-NEXT:    movd %edx, %xmm1
    320 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    321 ; SSE3-NEXT:    movd %eax, %xmm2
    322 ; SSE3-NEXT:    movd %ecx, %xmm0
    323 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    324 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    325 ; SSE3-NEXT:    retq
    326 ;
    327 ; SSSE3-LABEL: phsub_d_test2:
    328 ; SSSE3:       # %bb.0:
    329 ; SSSE3-NEXT:    phsubd %xmm1, %xmm0
    330 ; SSSE3-NEXT:    retq
    331 ;
    332 ; AVX-LABEL: phsub_d_test2:
    333 ; AVX:       # %bb.0:
    334 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
    335 ; AVX-NEXT:    retq
        ; Same phsub result built with out-of-order inserts (lanes 1,0,3,2); the
        ; operand order inside each sub is unchanged (non-commutative).
    336   %vecext = extractelement <4 x i32> %A, i32 2
    337   %vecext1 = extractelement <4 x i32> %A, i32 3
    338   %sub = sub i32 %vecext, %vecext1
    339   %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
    340   %vecext2 = extractelement <4 x i32> %A, i32 0
    341   %vecext3 = extractelement <4 x i32> %A, i32 1
    342   %sub4 = sub i32 %vecext2, %vecext3
    343   %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
    344   %vecext6 = extractelement <4 x i32> %B, i32 2
    345   %vecext7 = extractelement <4 x i32> %B, i32 3
    346   %sub8 = sub i32 %vecext6, %vecext7
    347   %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
    348   %vecext10 = extractelement <4 x i32> %B, i32 0
    349   %vecext11 = extractelement <4 x i32> %B, i32 1
    350   %sub12 = sub i32 %vecext10, %vecext11
    351   %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
    352   ret <4 x i32> %vecinit13
    353 }
    354 
    355 define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
    356 ; SSE-LABEL: hadd_pd_test1:
    357 ; SSE:       # %bb.0:
    358 ; SSE-NEXT:    haddpd %xmm1, %xmm0
    359 ; SSE-NEXT:    retq
    360 ;
    361 ; AVX-LABEL: hadd_pd_test1:
    362 ; AVX:       # %bb.0:
    363 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
    364 ; AVX-NEXT:    retq
        ; Double-precision pair: lane 0 = A0+A1, lane 1 = B0+B1, the exact
        ; (v)haddpd layout.
    365   %vecext = extractelement <2 x double> %A, i32 0
    366   %vecext1 = extractelement <2 x double> %A, i32 1
    367   %add = fadd double %vecext, %vecext1
    368   %vecinit = insertelement <2 x double> undef, double %add, i32 0
    369   %vecext2 = extractelement <2 x double> %B, i32 0
    370   %vecext3 = extractelement <2 x double> %B, i32 1
    371   %add2 = fadd double %vecext2, %vecext3
    372   %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
    373   ret <2 x double> %vecinit2
    374 }
    375 
    376 define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
    377 ; SSE-LABEL: hadd_pd_test2:
    378 ; SSE:       # %bb.0:
    379 ; SSE-NEXT:    haddpd %xmm1, %xmm0
    380 ; SSE-NEXT:    retq
    381 ;
    382 ; AVX-LABEL: hadd_pd_test2:
    383 ; AVX:       # %bb.0:
    384 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
    385 ; AVX-NEXT:    retq
        ; Commuted fadd operands (A1+A0, B1+B0); fadd is commutative, so this must
        ; still match (v)haddpd.
    386   %vecext = extractelement <2 x double> %A, i32 1
    387   %vecext1 = extractelement <2 x double> %A, i32 0
    388   %add = fadd double %vecext, %vecext1
    389   %vecinit = insertelement <2 x double> undef, double %add, i32 0
    390   %vecext2 = extractelement <2 x double> %B, i32 1
    391   %vecext3 = extractelement <2 x double> %B, i32 0
    392   %add2 = fadd double %vecext2, %vecext3
    393   %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
    394   ret <2 x double> %vecinit2
    395 }
    396 
    397 define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
    398 ; SSE-LABEL: hsub_pd_test1:
    399 ; SSE:       # %bb.0:
    400 ; SSE-NEXT:    hsubpd %xmm1, %xmm0
    401 ; SSE-NEXT:    retq
    402 ;
    403 ; AVX-LABEL: hsub_pd_test1:
    404 ; AVX:       # %bb.0:
    405 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
    406 ; AVX-NEXT:    retq
        ; Lane 0 = A0-A1, lane 1 = B0-B1, the (v)hsubpd layout. fsub operand order
        ; matters here (non-commutative).
    407   %vecext = extractelement <2 x double> %A, i32 0
    408   %vecext1 = extractelement <2 x double> %A, i32 1
    409   %sub = fsub double %vecext, %vecext1
    410   %vecinit = insertelement <2 x double> undef, double %sub, i32 0
    411   %vecext2 = extractelement <2 x double> %B, i32 0
    412   %vecext3 = extractelement <2 x double> %B, i32 1
    413   %sub2 = fsub double %vecext2, %vecext3
    414   %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
    415   ret <2 x double> %vecinit2
    416 }
    417 
    418 define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
    419 ; SSE-LABEL: hsub_pd_test2:
    420 ; SSE:       # %bb.0:
    421 ; SSE-NEXT:    hsubpd %xmm1, %xmm0
    422 ; SSE-NEXT:    retq
    423 ;
    424 ; AVX-LABEL: hsub_pd_test2:
    425 ; AVX:       # %bb.0:
    426 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
    427 ; AVX-NEXT:    retq
        ; Same hsubpd result built B-first: B's difference is inserted into lane 1
        ; before A's goes into lane 0, exercising insert-order independence.
    428   %vecext = extractelement <2 x double> %B, i32 0
    429   %vecext1 = extractelement <2 x double> %B, i32 1
    430   %sub = fsub double %vecext, %vecext1
    431   %vecinit = insertelement <2 x double> undef, double %sub, i32 1
    432   %vecext2 = extractelement <2 x double> %A, i32 0
    433   %vecext3 = extractelement <2 x double> %A, i32 1
    434   %sub2 = fsub double %vecext2, %vecext3
    435   %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
    436   ret <2 x double> %vecinit2
    437 }
    438 
    439 define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
    440 ; SSE-LABEL: avx_vhadd_pd_test:
    441 ; SSE:       # %bb.0:
    442 ; SSE-NEXT:    haddpd %xmm1, %xmm0
    443 ; SSE-NEXT:    haddpd %xmm3, %xmm2
    444 ; SSE-NEXT:    movapd %xmm2, %xmm1
    445 ; SSE-NEXT:    retq
    446 ;
    447 ; AVX-LABEL: avx_vhadd_pd_test:
    448 ; AVX:       # %bb.0:
    449 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    450 ; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
    451 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    452 ; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
    453 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    454 ; AVX-NEXT:    retq
        ; 256-bit result laid out as [A0+A1, A2+A3, B0+B1, B2+B3]. That crosses the
        ; 128-bit lane boundary of vhaddpd ymm, so AVX lowers via two 128-bit hadds
        ; plus extract/insert, and SSE uses two haddpd.
    455   %vecext = extractelement <4 x double> %A, i32 0
    456   %vecext1 = extractelement <4 x double> %A, i32 1
    457   %add = fadd double %vecext, %vecext1
    458   %vecinit = insertelement <4 x double> undef, double %add, i32 0
    459   %vecext2 = extractelement <4 x double> %A, i32 2
    460   %vecext3 = extractelement <4 x double> %A, i32 3
    461   %add4 = fadd double %vecext2, %vecext3
    462   %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
    463   %vecext6 = extractelement <4 x double> %B, i32 0
    464   %vecext7 = extractelement <4 x double> %B, i32 1
    465   %add8 = fadd double %vecext6, %vecext7
    466   %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
    467   %vecext10 = extractelement <4 x double> %B, i32 2
    468   %vecext11 = extractelement <4 x double> %B, i32 3
    469   %add12 = fadd double %vecext10, %vecext11
    470   %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
    471   ret <4 x double> %vecinit13
    472 }
    473 
    474 define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
    475 ; SSE-LABEL: avx_vhsub_pd_test:
    476 ; SSE:       # %bb.0:
    477 ; SSE-NEXT:    hsubpd %xmm1, %xmm0
    478 ; SSE-NEXT:    hsubpd %xmm3, %xmm2
    479 ; SSE-NEXT:    movapd %xmm2, %xmm1
    480 ; SSE-NEXT:    retq
    481 ;
    482 ; AVX-LABEL: avx_vhsub_pd_test:
    483 ; AVX:       # %bb.0:
    484 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    485 ; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
    486 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    487 ; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
    488 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    489 ; AVX-NEXT:    retq
        ; hsub analogue of avx_vhadd_pd_test: [A0-A1, A2-A3, B0-B1, B2-B3] crosses
        ; the 128-bit lanes of vhsubpd ymm, so AVX lowers via two 128-bit hsubs
        ; plus extract/insert.
    490   %vecext = extractelement <4 x double> %A, i32 0
    491   %vecext1 = extractelement <4 x double> %A, i32 1
    492   %sub = fsub double %vecext, %vecext1
    493   %vecinit = insertelement <4 x double> undef, double %sub, i32 0
    494   %vecext2 = extractelement <4 x double> %A, i32 2
    495   %vecext3 = extractelement <4 x double> %A, i32 3
    496   %sub4 = fsub double %vecext2, %vecext3
    497   %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
    498   %vecext6 = extractelement <4 x double> %B, i32 0
    499   %vecext7 = extractelement <4 x double> %B, i32 1
    500   %sub8 = fsub double %vecext6, %vecext7
    501   %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
    502   %vecext10 = extractelement <4 x double> %B, i32 2
    503   %vecext11 = extractelement <4 x double> %B, i32 3
    504   %sub12 = fsub double %vecext10, %vecext11
    505   %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
    506   ret <4 x double> %vecinit13
    507 }
    508 
    509 define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
    510 ; SSE3-LABEL: avx2_vphadd_d_test:
    511 ; SSE3:       # %bb.0:
    512 ; SSE3-NEXT:    movd %xmm0, %ecx
    513 ; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
    514 ; SSE3-NEXT:    movd %xmm4, %r8d
    515 ; SSE3-NEXT:    addl %ecx, %r8d
    516 ; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
    517 ; SSE3-NEXT:    movd %xmm4, %edx
    518 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    519 ; SSE3-NEXT:    movd %xmm0, %r9d
    520 ; SSE3-NEXT:    addl %edx, %r9d
    521 ; SSE3-NEXT:    movd %xmm1, %edx
    522 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    523 ; SSE3-NEXT:    movd %xmm0, %esi
    524 ; SSE3-NEXT:    addl %edx, %esi
    525 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    526 ; SSE3-NEXT:    movd %xmm0, %edx
    527 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    528 ; SSE3-NEXT:    movd %xmm0, %edi
    529 ; SSE3-NEXT:    addl %edx, %edi
    530 ; SSE3-NEXT:    movd %xmm2, %eax
    531 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
    532 ; SSE3-NEXT:    movd %xmm0, %r10d
    533 ; SSE3-NEXT:    addl %eax, %r10d
    534 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
    535 ; SSE3-NEXT:    movd %xmm0, %eax
    536 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
    537 ; SSE3-NEXT:    movd %xmm0, %ecx
    538 ; SSE3-NEXT:    addl %eax, %ecx
    539 ; SSE3-NEXT:    movd %xmm3, %eax
    540 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
    541 ; SSE3-NEXT:    movd %xmm0, %edx
    542 ; SSE3-NEXT:    addl %eax, %edx
    543 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
    544 ; SSE3-NEXT:    movd %xmm0, %r11d
    545 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
    546 ; SSE3-NEXT:    movd %xmm0, %eax
    547 ; SSE3-NEXT:    addl %r11d, %eax
    548 ; SSE3-NEXT:    movd %edi, %xmm0
    549 ; SSE3-NEXT:    movd %esi, %xmm1
    550 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    551 ; SSE3-NEXT:    movd %r9d, %xmm2
    552 ; SSE3-NEXT:    movd %r8d, %xmm0
    553 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    554 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    555 ; SSE3-NEXT:    movd %eax, %xmm1
    556 ; SSE3-NEXT:    movd %edx, %xmm2
    557 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
    558 ; SSE3-NEXT:    movd %ecx, %xmm3
    559 ; SSE3-NEXT:    movd %r10d, %xmm1
    560 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
    561 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
    562 ; SSE3-NEXT:    retq
    563 ;
    564 ; SSSE3-LABEL: avx2_vphadd_d_test:
    565 ; SSSE3:       # %bb.0:
    566 ; SSSE3-NEXT:    phaddd %xmm1, %xmm0
    567 ; SSSE3-NEXT:    phaddd %xmm3, %xmm2
    568 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
    569 ; SSSE3-NEXT:    retq
    570 ;
    571 ; AVX1-LABEL: avx2_vphadd_d_test:
    572 ; AVX1:       # %bb.0:
    573 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    574 ; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
    575 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    576 ; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
    577 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    578 ; AVX1-NEXT:    retq
    579 ;
    580 ; AVX2-LABEL: avx2_vphadd_d_test:
    581 ; AVX2:       # %bb.0:
    582 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
    583 ; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
    584 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
    585 ; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
    586 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
    587 ; AVX2-NEXT:    retq
        ; 8 x i32 result holding all four pairwise sums of A followed by all four of
        ; B. That layout crosses the 128-bit lanes of vphaddd ymm, so AVX1/AVX2
        ; lower via two 128-bit phaddd plus extract/insert; SSSE3 uses two phaddd,
        ; and plain SSE3 (no phaddd) stays fully scalar.
    588   %vecext = extractelement <8 x i32> %A, i32 0
    589   %vecext1 = extractelement <8 x i32> %A, i32 1
    590   %add = add i32 %vecext, %vecext1
    591   %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
    592   %vecext2 = extractelement <8 x i32> %A, i32 2
    593   %vecext3 = extractelement <8 x i32> %A, i32 3
    594   %add4 = add i32 %vecext2, %vecext3
    595   %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
    596   %vecext6 = extractelement <8 x i32> %A, i32 4
    597   %vecext7 = extractelement <8 x i32> %A, i32 5
    598   %add8 = add i32 %vecext6, %vecext7
    599   %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
    600   %vecext10 = extractelement <8 x i32> %A, i32 6
    601   %vecext11 = extractelement <8 x i32> %A, i32 7
    602   %add12 = add i32 %vecext10, %vecext11
    603   %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
    604   %vecext14 = extractelement <8 x i32> %B, i32 0
    605   %vecext15 = extractelement <8 x i32> %B, i32 1
    606   %add16 = add i32 %vecext14, %vecext15
    607   %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
    608   %vecext18 = extractelement <8 x i32> %B, i32 2
    609   %vecext19 = extractelement <8 x i32> %B, i32 3
    610   %add20 = add i32 %vecext18, %vecext19
    611   %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
    612   %vecext22 = extractelement <8 x i32> %B, i32 4
    613   %vecext23 = extractelement <8 x i32> %B, i32 5
    614   %add24 = add i32 %vecext22, %vecext23
    615   %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
    616   %vecext26 = extractelement <8 x i32> %B, i32 6
    617   %vecext27 = extractelement <8 x i32> %B, i32 7
    618   %add28 = add i32 %vecext26, %vecext27
    619   %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
    620   ret <8 x i32> %vecinit29
    621 }
    622 
    623 define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
    624 ; SSE3-LABEL: avx2_vphadd_w_test:
    625 ; SSE3:       # %bb.0:
    626 ; SSE3-NEXT:    pushq %rbp
    627 ; SSE3-NEXT:    .cfi_def_cfa_offset 16
    628 ; SSE3-NEXT:    pushq %r15
    629 ; SSE3-NEXT:    .cfi_def_cfa_offset 24
    630 ; SSE3-NEXT:    pushq %r14
    631 ; SSE3-NEXT:    .cfi_def_cfa_offset 32
    632 ; SSE3-NEXT:    pushq %r13
    633 ; SSE3-NEXT:    .cfi_def_cfa_offset 40
    634 ; SSE3-NEXT:    pushq %r12
    635 ; SSE3-NEXT:    .cfi_def_cfa_offset 48
    636 ; SSE3-NEXT:    pushq %rbx
    637 ; SSE3-NEXT:    .cfi_def_cfa_offset 56
    638 ; SSE3-NEXT:    .cfi_offset %rbx, -56
    639 ; SSE3-NEXT:    .cfi_offset %r12, -48
    640 ; SSE3-NEXT:    .cfi_offset %r13, -40
    641 ; SSE3-NEXT:    .cfi_offset %r14, -32
    642 ; SSE3-NEXT:    .cfi_offset %r15, -24
    643 ; SSE3-NEXT:    .cfi_offset %rbp, -16
    644 ; SSE3-NEXT:    movd %xmm0, %eax
    645 ; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
    646 ; SSE3-NEXT:    addl %eax, %ecx
    647 ; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
    648 ; SSE3-NEXT:    pextrw $2, %xmm0, %eax
    649 ; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
    650 ; SSE3-NEXT:    addl %eax, %ecx
    651 ; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
    652 ; SSE3-NEXT:    pextrw $4, %xmm0, %eax
    653 ; SSE3-NEXT:    pextrw $5, %xmm0, %r11d
    654 ; SSE3-NEXT:    addl %eax, %r11d
    655 ; SSE3-NEXT:    pextrw $6, %xmm0, %eax
    656 ; SSE3-NEXT:    pextrw $7, %xmm0, %r15d
    657 ; SSE3-NEXT:    addl %eax, %r15d
    658 ; SSE3-NEXT:    movd %xmm1, %eax
    659 ; SSE3-NEXT:    pextrw $1, %xmm1, %r13d
    660 ; SSE3-NEXT:    addl %eax, %r13d
    661 ; SSE3-NEXT:    pextrw $2, %xmm1, %eax
    662 ; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
    663 ; SSE3-NEXT:    addl %eax, %ebx
    664 ; SSE3-NEXT:    pextrw $4, %xmm1, %eax
    665 ; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
    666 ; SSE3-NEXT:    addl %eax, %r8d
    667 ; SSE3-NEXT:    pextrw $6, %xmm1, %eax
    668 ; SSE3-NEXT:    pextrw $7, %xmm1, %esi
    669 ; SSE3-NEXT:    addl %eax, %esi
    670 ; SSE3-NEXT:    movd %xmm2, %eax
    671 ; SSE3-NEXT:    pextrw $1, %xmm2, %r10d
    672 ; SSE3-NEXT:    addl %eax, %r10d
    673 ; SSE3-NEXT:    pextrw $2, %xmm2, %eax
    674 ; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
    675 ; SSE3-NEXT:    addl %eax, %r14d
    676 ; SSE3-NEXT:    pextrw $4, %xmm2, %eax
    677 ; SSE3-NEXT:    pextrw $5, %xmm2, %r12d
    678 ; SSE3-NEXT:    addl %eax, %r12d
    679 ; SSE3-NEXT:    pextrw $6, %xmm2, %eax
    680 ; SSE3-NEXT:    pextrw $7, %xmm2, %r9d
    681 ; SSE3-NEXT:    addl %eax, %r9d
    682 ; SSE3-NEXT:    movd %xmm3, %eax
    683 ; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
    684 ; SSE3-NEXT:    addl %eax, %ebp
    685 ; SSE3-NEXT:    pextrw $2, %xmm3, %edx
    686 ; SSE3-NEXT:    pextrw $3, %xmm3, %edi
    687 ; SSE3-NEXT:    addl %edx, %edi
    688 ; SSE3-NEXT:    pextrw $4, %xmm3, %edx
    689 ; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
    690 ; SSE3-NEXT:    addl %edx, %ecx
    691 ; SSE3-NEXT:    pextrw $6, %xmm3, %edx
    692 ; SSE3-NEXT:    pextrw $7, %xmm3, %eax
    693 ; SSE3-NEXT:    addl %edx, %eax
    694 ; SSE3-NEXT:    movd %esi, %xmm8
    695 ; SSE3-NEXT:    movd %r8d, %xmm3
    696 ; SSE3-NEXT:    movd %ebx, %xmm9
    697 ; SSE3-NEXT:    movd %r13d, %xmm4
    698 ; SSE3-NEXT:    movd %r15d, %xmm10
    699 ; SSE3-NEXT:    movd %r11d, %xmm7
    700 ; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
    701 ; SSE3-NEXT:    # xmm11 = mem[0],zero,zero,zero
    702 ; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
    703 ; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
    704 ; SSE3-NEXT:    movd %eax, %xmm12
    705 ; SSE3-NEXT:    movd %ecx, %xmm6
    706 ; SSE3-NEXT:    movd %edi, %xmm13
    707 ; SSE3-NEXT:    movd %ebp, %xmm5
    708 ; SSE3-NEXT:    movd %r9d, %xmm14
    709 ; SSE3-NEXT:    movd %r12d, %xmm2
    710 ; SSE3-NEXT:    movd %r14d, %xmm15
    711 ; SSE3-NEXT:    movd %r10d, %xmm1
    712 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
    713 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
    714 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    715 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
    716 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
    717 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
    718 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
    719 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
    720 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
    721 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    722 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
    723 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
    724 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    725 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
    726 ; SSE3-NEXT:    popq %rbx
    727 ; SSE3-NEXT:    .cfi_def_cfa_offset 48
    728 ; SSE3-NEXT:    popq %r12
    729 ; SSE3-NEXT:    .cfi_def_cfa_offset 40
    730 ; SSE3-NEXT:    popq %r13
    731 ; SSE3-NEXT:    .cfi_def_cfa_offset 32
    732 ; SSE3-NEXT:    popq %r14
    733 ; SSE3-NEXT:    .cfi_def_cfa_offset 24
    734 ; SSE3-NEXT:    popq %r15
    735 ; SSE3-NEXT:    .cfi_def_cfa_offset 16
    736 ; SSE3-NEXT:    popq %rbp
    737 ; SSE3-NEXT:    .cfi_def_cfa_offset 8
    738 ; SSE3-NEXT:    retq
    739 ;
    740 ; SSSE3-LABEL: avx2_vphadd_w_test:
    741 ; SSSE3:       # %bb.0:
    742 ; SSSE3-NEXT:    phaddw %xmm1, %xmm0
    743 ; SSSE3-NEXT:    phaddw %xmm3, %xmm2
    744 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
    745 ; SSSE3-NEXT:    retq
    746 ;
    747 ; AVX1-LABEL: avx2_vphadd_w_test:
    748 ; AVX1:       # %bb.0:
    749 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    750 ; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
    751 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    752 ; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
    753 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    754 ; AVX1-NEXT:    retq
    755 ;
    756 ; AVX2-LABEL: avx2_vphadd_w_test:
    757 ; AVX2:       # %bb.0:
    758 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
    759 ; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
    760 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
    761 ; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
    762 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
    763 ; AVX2-NEXT:    retq
    764   %vecext = extractelement <16 x i16> %a, i32 0
    765   %vecext1 = extractelement <16 x i16> %a, i32 1
    766   %add = add i16 %vecext, %vecext1
    767   %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
    768   %vecext4 = extractelement <16 x i16> %a, i32 2
    769   %vecext6 = extractelement <16 x i16> %a, i32 3
    770   %add8 = add i16 %vecext4, %vecext6
    771   %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
    772   %vecext11 = extractelement <16 x i16> %a, i32 4
    773   %vecext13 = extractelement <16 x i16> %a, i32 5
    774   %add15 = add i16 %vecext11, %vecext13
    775   %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
    776   %vecext18 = extractelement <16 x i16> %a, i32 6
    777   %vecext20 = extractelement <16 x i16> %a, i32 7
    778   %add22 = add i16 %vecext18, %vecext20
    779   %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
    780   %vecext25 = extractelement <16 x i16> %a, i32 8
    781   %vecext27 = extractelement <16 x i16> %a, i32 9
    782   %add29 = add i16 %vecext25, %vecext27
    783   %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
    784   %vecext32 = extractelement <16 x i16> %a, i32 10
    785   %vecext34 = extractelement <16 x i16> %a, i32 11
    786   %add36 = add i16 %vecext32, %vecext34
    787   %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
    788   %vecext39 = extractelement <16 x i16> %a, i32 12
    789   %vecext41 = extractelement <16 x i16> %a, i32 13
    790   %add43 = add i16 %vecext39, %vecext41
    791   %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
    792   %vecext46 = extractelement <16 x i16> %a, i32 14
    793   %vecext48 = extractelement <16 x i16> %a, i32 15
    794   %add50 = add i16 %vecext46, %vecext48
    795   %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
    796   %vecext53 = extractelement <16 x i16> %b, i32 0
    797   %vecext55 = extractelement <16 x i16> %b, i32 1
    798   %add57 = add i16 %vecext53, %vecext55
    799   %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
    800   %vecext60 = extractelement <16 x i16> %b, i32 2
    801   %vecext62 = extractelement <16 x i16> %b, i32 3
    802   %add64 = add i16 %vecext60, %vecext62
    803   %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
    804   %vecext67 = extractelement <16 x i16> %b, i32 4
    805   %vecext69 = extractelement <16 x i16> %b, i32 5
    806   %add71 = add i16 %vecext67, %vecext69
    807   %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
    808   %vecext74 = extractelement <16 x i16> %b, i32 6
    809   %vecext76 = extractelement <16 x i16> %b, i32 7
    810   %add78 = add i16 %vecext74, %vecext76
    811   %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
    812   %vecext81 = extractelement <16 x i16> %b, i32 8
    813   %vecext83 = extractelement <16 x i16> %b, i32 9
    814   %add85 = add i16 %vecext81, %vecext83
    815   %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
    816   %vecext88 = extractelement <16 x i16> %b, i32 10
    817   %vecext90 = extractelement <16 x i16> %b, i32 11
    818   %add92 = add i16 %vecext88, %vecext90
    819   %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
    820   %vecext95 = extractelement <16 x i16> %b, i32 12
    821   %vecext97 = extractelement <16 x i16> %b, i32 13
    822   %add99 = add i16 %vecext95, %vecext97
    823   %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
    824   %vecext102 = extractelement <16 x i16> %b, i32 14
    825   %vecext104 = extractelement <16 x i16> %b, i32 15
    826   %add106 = add i16 %vecext102, %vecext104
    827   %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
    828   ret <16 x i16> %vecinit108
    829 }
    830 
    831 ; Verify that we don't select horizontal subs in the following functions.
    832 
    833 define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
        ; Negative test: lanes 2 and 3 compute B[1]-B[0] and B[3]-B[2], i.e. the
        ; operand order inside those pairs is reversed relative to a horizontal
        ; subtract, so this must NOT be matched to phsubd/vphsubd.
    834 ; SSE-LABEL: not_a_hsub_1:
    835 ; SSE:       # %bb.0:
    836 ; SSE-NEXT:    movd %xmm0, %eax
    837 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
    838 ; SSE-NEXT:    movd %xmm2, %ecx
    839 ; SSE-NEXT:    subl %ecx, %eax
    840 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    841 ; SSE-NEXT:    movd %xmm2, %ecx
    842 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    843 ; SSE-NEXT:    movd %xmm0, %edx
    844 ; SSE-NEXT:    subl %edx, %ecx
    845 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
    846 ; SSE-NEXT:    movd %xmm0, %edx
    847 ; SSE-NEXT:    movd %xmm1, %esi
    848 ; SSE-NEXT:    subl %esi, %edx
    849 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
    850 ; SSE-NEXT:    movd %xmm0, %esi
    851 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
    852 ; SSE-NEXT:    movd %xmm0, %edi
    853 ; SSE-NEXT:    subl %edi, %esi
    854 ; SSE-NEXT:    movd %esi, %xmm0
    855 ; SSE-NEXT:    movd %edx, %xmm1
    856 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    857 ; SSE-NEXT:    movd %ecx, %xmm2
    858 ; SSE-NEXT:    movd %eax, %xmm0
    859 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    860 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    861 ; SSE-NEXT:    retq
    862 ;
    863 ; AVX-LABEL: not_a_hsub_1:
    864 ; AVX:       # %bb.0:
    865 ; AVX-NEXT:    vmovd %xmm0, %eax
    866 ; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
    867 ; AVX-NEXT:    subl %ecx, %eax
    868 ; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
    869 ; AVX-NEXT:    vpextrd $3, %xmm0, %edx
    870 ; AVX-NEXT:    subl %edx, %ecx
    871 ; AVX-NEXT:    vpextrd $1, %xmm1, %edx
    872 ; AVX-NEXT:    vmovd %xmm1, %esi
    873 ; AVX-NEXT:    subl %esi, %edx
    874 ; AVX-NEXT:    vpextrd $3, %xmm1, %esi
    875 ; AVX-NEXT:    vpextrd $2, %xmm1, %edi
    876 ; AVX-NEXT:    subl %edi, %esi
    877 ; AVX-NEXT:    vmovd %eax, %xmm0
    878 ; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
    879 ; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
    880 ; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
    881 ; AVX-NEXT:    retq
        ; Lanes 0 and 1 follow the hsub pattern (A[0]-A[1], A[2]-A[3])...
    882   %vecext = extractelement <4 x i32> %A, i32 0
    883   %vecext1 = extractelement <4 x i32> %A, i32 1
    884   %sub = sub i32 %vecext, %vecext1
    885   %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
    886   %vecext2 = extractelement <4 x i32> %A, i32 2
    887   %vecext3 = extractelement <4 x i32> %A, i32 3
    888   %sub4 = sub i32 %vecext2, %vecext3
    889   %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
        ; ...but lanes 2 and 3 subtract in the opposite order (B[1]-B[0],
        ; B[3]-B[2]), which breaks the horizontal-subtract idiom.
    890   %vecext6 = extractelement <4 x i32> %B, i32 1
    891   %vecext7 = extractelement <4 x i32> %B, i32 0
    892   %sub8 = sub i32 %vecext6, %vecext7
    893   %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
    894   %vecext10 = extractelement <4 x i32> %B, i32 3
    895   %vecext11 = extractelement <4 x i32> %B, i32 2
    896   %sub12 = sub i32 %vecext10, %vecext11
    897   %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
    898   ret <4 x i32> %vecinit13
    899 }
    900 
    901 define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
        ; Negative test: lanes 0-2 follow the hsub pattern (A[0]-A[1], A[2]-A[3],
        ; B[0]-B[1]) but lane 3 computes B[3]-B[2] with the operands reversed,
        ; so the sequence must NOT be matched to hsubps/vhsubps.
    902 ; SSE-LABEL: not_a_hsub_2:
    903 ; SSE:       # %bb.0:
    904 ; SSE-NEXT:    movaps %xmm0, %xmm2
    905 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    906 ; SSE-NEXT:    movaps %xmm0, %xmm3
    907 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
    908 ; SSE-NEXT:    subss %xmm3, %xmm2
    909 ; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    910 ; SSE-NEXT:    subss %xmm3, %xmm0
    911 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    912 ; SSE-NEXT:    movaps %xmm1, %xmm2
    913 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
    914 ; SSE-NEXT:    movaps %xmm1, %xmm3
    915 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
    916 ; SSE-NEXT:    subss %xmm3, %xmm2
    917 ; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    918 ; SSE-NEXT:    subss %xmm3, %xmm1
    919 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    920 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    921 ; SSE-NEXT:    retq
    922 ;
    923 ; AVX-LABEL: not_a_hsub_2:
    924 ; AVX:       # %bb.0:
    925 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    926 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
    927 ; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
    928 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    929 ; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
    930 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
    931 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    932 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    933 ; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
    934 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    935 ; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
    936 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
    937 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
    938 ; AVX-NEXT:    retq
    939   %vecext = extractelement <4 x float> %A, i32 2
    940   %vecext1 = extractelement <4 x float> %A, i32 3
    941   %sub = fsub float %vecext, %vecext1
    942   %vecinit = insertelement <4 x float> undef, float %sub, i32 1
    943   %vecext2 = extractelement <4 x float> %A, i32 0
    944   %vecext3 = extractelement <4 x float> %A, i32 1
    945   %sub4 = fsub float %vecext2, %vecext3
    946   %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
        ; Lane 3 is B[3]-B[2]: reversed operand order defeats the hsub match.
    947   %vecext6 = extractelement <4 x float> %B, i32 3
    948   %vecext7 = extractelement <4 x float> %B, i32 2
    949   %sub8 = fsub float %vecext6, %vecext7
    950   %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
    951   %vecext10 = extractelement <4 x float> %B, i32 0
    952   %vecext11 = extractelement <4 x float> %B, i32 1
    953   %sub12 = fsub float %vecext10, %vecext11
    954   %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
    955   ret <4 x float> %vecinit13
    956 }
    957 
    958 define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
        ; Negative test: lane 1 is B[0]-B[1] (the hsubpd pattern), but lane 0 is
        ; A[1]-A[0] with the operands reversed, so this must NOT be matched to
        ; hsubpd/vhsubpd.
    959 ; SSE-LABEL: not_a_hsub_3:
    960 ; SSE:       # %bb.0:
    961 ; SSE-NEXT:    movaps %xmm1, %xmm2
    962 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    963 ; SSE-NEXT:    subsd %xmm2, %xmm1
    964 ; SSE-NEXT:    movaps %xmm0, %xmm2
    965 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    966 ; SSE-NEXT:    subsd %xmm0, %xmm2
    967 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
    968 ; SSE-NEXT:    movapd %xmm2, %xmm0
    969 ; SSE-NEXT:    retq
    970 ;
    971 ; AVX-LABEL: not_a_hsub_3:
    972 ; AVX:       # %bb.0:
    973 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    974 ; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
    975 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    976 ; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
    977 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    978 ; AVX-NEXT:    retq
    979   %vecext = extractelement <2 x double> %B, i32 0
    980   %vecext1 = extractelement <2 x double> %B, i32 1
    981   %sub = fsub double %vecext, %vecext1
    982   %vecinit = insertelement <2 x double> undef, double %sub, i32 1
        ; Lane 0 is A[1]-A[0]: reversed operand order defeats the hsub match.
    983   %vecext2 = extractelement <2 x double> %A, i32 1
    984   %vecext3 = extractelement <2 x double> %A, i32 0
    985   %sub2 = fsub double %vecext2, %vecext3
    986   %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
    987   ret <2 x double> %vecinit2
    988 }
    989 
    990 ; Test AVX horizontal add/sub of packed single/double precision
    991 ; floating point values from 256-bit vectors.
    992 
    993 define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
        ; Result lanes are [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5,
        ; b6+b7] -- exactly the cross-128-bit-lane layout of vhaddps ymm, so AVX
        ; folds the whole sequence into one instruction and SSE into two haddps.
    994 ; SSE-LABEL: avx_vhadd_ps:
    995 ; SSE:       # %bb.0:
    996 ; SSE-NEXT:    haddps %xmm2, %xmm0
    997 ; SSE-NEXT:    haddps %xmm3, %xmm1
    998 ; SSE-NEXT:    retq
    999 ;
   1000 ; AVX-LABEL: avx_vhadd_ps:
   1001 ; AVX:       # %bb.0:
   1002 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
   1003 ; AVX-NEXT:    retq
   1004   %vecext = extractelement <8 x float> %a, i32 0
   1005   %vecext1 = extractelement <8 x float> %a, i32 1
   1006   %add = fadd float %vecext, %vecext1
   1007   %vecinit = insertelement <8 x float> undef, float %add, i32 0
   1008   %vecext2 = extractelement <8 x float> %a, i32 2
   1009   %vecext3 = extractelement <8 x float> %a, i32 3
   1010   %add4 = fadd float %vecext2, %vecext3
   1011   %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
   1012   %vecext6 = extractelement <8 x float> %b, i32 0
   1013   %vecext7 = extractelement <8 x float> %b, i32 1
   1014   %add8 = fadd float %vecext6, %vecext7
   1015   %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
   1016   %vecext10 = extractelement <8 x float> %b, i32 2
   1017   %vecext11 = extractelement <8 x float> %b, i32 3
   1018   %add12 = fadd float %vecext10, %vecext11
   1019   %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
   1020   %vecext14 = extractelement <8 x float> %a, i32 4
   1021   %vecext15 = extractelement <8 x float> %a, i32 5
   1022   %add16 = fadd float %vecext14, %vecext15
   1023   %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
   1024   %vecext18 = extractelement <8 x float> %a, i32 6
   1025   %vecext19 = extractelement <8 x float> %a, i32 7
   1026   %add20 = fadd float %vecext18, %vecext19
   1027   %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
   1028   %vecext22 = extractelement <8 x float> %b, i32 4
   1029   %vecext23 = extractelement <8 x float> %b, i32 5
   1030   %add24 = fadd float %vecext22, %vecext23
   1031   %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
   1032   %vecext26 = extractelement <8 x float> %b, i32 6
   1033   %vecext27 = extractelement <8 x float> %b, i32 7
   1034   %add28 = fadd float %vecext26, %vecext27
   1035   %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
   1036   ret <8 x float> %vecinit29
   1037 }
   1038 
   1039 define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
        ; Result lanes are [a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5,
        ; b6-b7] -- the cross-128-bit-lane layout of vhsubps ymm, so AVX folds
        ; the sequence into one instruction and SSE into two hsubps.
   1040 ; SSE-LABEL: avx_vhsub_ps:
   1041 ; SSE:       # %bb.0:
   1042 ; SSE-NEXT:    hsubps %xmm2, %xmm0
   1043 ; SSE-NEXT:    hsubps %xmm3, %xmm1
   1044 ; SSE-NEXT:    retq
   1045 ;
   1046 ; AVX-LABEL: avx_vhsub_ps:
   1047 ; AVX:       # %bb.0:
   1048 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
   1049 ; AVX-NEXT:    retq
   1050   %vecext = extractelement <8 x float> %a, i32 0
   1051   %vecext1 = extractelement <8 x float> %a, i32 1
   1052   %sub = fsub float %vecext, %vecext1
   1053   %vecinit = insertelement <8 x float> undef, float %sub, i32 0
   1054   %vecext2 = extractelement <8 x float> %a, i32 2
   1055   %vecext3 = extractelement <8 x float> %a, i32 3
   1056   %sub4 = fsub float %vecext2, %vecext3
   1057   %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
   1058   %vecext6 = extractelement <8 x float> %b, i32 0
   1059   %vecext7 = extractelement <8 x float> %b, i32 1
   1060   %sub8 = fsub float %vecext6, %vecext7
   1061   %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
   1062   %vecext10 = extractelement <8 x float> %b, i32 2
   1063   %vecext11 = extractelement <8 x float> %b, i32 3
   1064   %sub12 = fsub float %vecext10, %vecext11
   1065   %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
   1066   %vecext14 = extractelement <8 x float> %a, i32 4
   1067   %vecext15 = extractelement <8 x float> %a, i32 5
   1068   %sub16 = fsub float %vecext14, %vecext15
   1069   %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
   1070   %vecext18 = extractelement <8 x float> %a, i32 6
   1071   %vecext19 = extractelement <8 x float> %a, i32 7
   1072   %sub20 = fsub float %vecext18, %vecext19
   1073   %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
   1074   %vecext22 = extractelement <8 x float> %b, i32 4
   1075   %vecext23 = extractelement <8 x float> %b, i32 5
   1076   %sub24 = fsub float %vecext22, %vecext23
   1077   %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
   1078   %vecext26 = extractelement <8 x float> %b, i32 6
   1079   %vecext27 = extractelement <8 x float> %b, i32 7
   1080   %sub28 = fsub float %vecext26, %vecext27
   1081   %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
   1082   ret <8 x float> %vecinit29
   1083 }
   1084 
   1085 define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
        ; Result is [a0+a1, b0+b1, a2+a3, b2+b3] -- the vhaddpd ymm cross-lane
        ; layout, so AVX folds the sequence to one vhaddpd and SSE to two haddpd.
   1086 ; SSE-LABEL: avx_hadd_pd:
   1087 ; SSE:       # %bb.0:
   1088 ; SSE-NEXT:    haddpd %xmm2, %xmm0
   1089 ; SSE-NEXT:    haddpd %xmm3, %xmm1
   1090 ; SSE-NEXT:    retq
   1091 ;
   1092 ; AVX-LABEL: avx_hadd_pd:
   1093 ; AVX:       # %bb.0:
   1094 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
   1095 ; AVX-NEXT:    retq
   1096   %vecext = extractelement <4 x double> %a, i32 0
   1097   %vecext1 = extractelement <4 x double> %a, i32 1
   1098   %add = fadd double %vecext, %vecext1
   1099   %vecinit = insertelement <4 x double> undef, double %add, i32 0
   1100   %vecext2 = extractelement <4 x double> %b, i32 0
   1101   %vecext3 = extractelement <4 x double> %b, i32 1
   1102   %add4 = fadd double %vecext2, %vecext3
   1103   %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
   1104   %vecext6 = extractelement <4 x double> %a, i32 2
   1105   %vecext7 = extractelement <4 x double> %a, i32 3
   1106   %add8 = fadd double %vecext6, %vecext7
   1107   %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
   1108   %vecext10 = extractelement <4 x double> %b, i32 2
   1109   %vecext11 = extractelement <4 x double> %b, i32 3
   1110   %add12 = fadd double %vecext10, %vecext11
   1111   %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
   1112   ret <4 x double> %vecinit13
   1113 }
   1114 
   1115 define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
        ; Result is [a0-a1, b0-b1, a2-a3, b2-b3] -- the vhsubpd ymm cross-lane
        ; layout, so AVX folds the sequence to one vhsubpd and SSE to two hsubpd.
   1116 ; SSE-LABEL: avx_hsub_pd:
   1117 ; SSE:       # %bb.0:
   1118 ; SSE-NEXT:    hsubpd %xmm2, %xmm0
   1119 ; SSE-NEXT:    hsubpd %xmm3, %xmm1
   1120 ; SSE-NEXT:    retq
   1121 ;
   1122 ; AVX-LABEL: avx_hsub_pd:
   1123 ; AVX:       # %bb.0:
   1124 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
   1125 ; AVX-NEXT:    retq
   1126   %vecext = extractelement <4 x double> %a, i32 0
   1127   %vecext1 = extractelement <4 x double> %a, i32 1
   1128   %sub = fsub double %vecext, %vecext1
   1129   %vecinit = insertelement <4 x double> undef, double %sub, i32 0
   1130   %vecext2 = extractelement <4 x double> %b, i32 0
   1131   %vecext3 = extractelement <4 x double> %b, i32 1
   1132   %sub4 = fsub double %vecext2, %vecext3
   1133   %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
   1134   %vecext6 = extractelement <4 x double> %a, i32 2
   1135   %vecext7 = extractelement <4 x double> %a, i32 3
   1136   %sub8 = fsub double %vecext6, %vecext7
   1137   %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
   1138   %vecext10 = extractelement <4 x double> %b, i32 2
   1139   %vecext11 = extractelement <4 x double> %b, i32 3
   1140   %sub12 = fsub double %vecext10, %vecext11
   1141   %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
   1142   ret <4 x double> %vecinit13
   1143 }
   1144 
   1145 ; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
   1146 
   1147 define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
        ; Result is [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7] --
        ; the vphaddd ymm cross-lane layout. AVX2 folds this to one vphaddd;
        ; AVX1 splits into two 128-bit vphaddd ops; SSSE3 uses two phaddd; plain
        ; SSE3 has no phaddd and lowers to scalar extract/add/rebuild.
   1148 ; SSE3-LABEL: avx2_hadd_d:
   1149 ; SSE3:       # %bb.0:
   1150 ; SSE3-NEXT:    movd %xmm0, %ecx
   1151 ; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
   1152 ; SSE3-NEXT:    movd %xmm4, %r8d
   1153 ; SSE3-NEXT:    addl %ecx, %r8d
   1154 ; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
   1155 ; SSE3-NEXT:    movd %xmm4, %edx
   1156 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
   1157 ; SSE3-NEXT:    movd %xmm0, %r9d
   1158 ; SSE3-NEXT:    addl %edx, %r9d
   1159 ; SSE3-NEXT:    movd %xmm2, %edx
   1160 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
   1161 ; SSE3-NEXT:    movd %xmm0, %esi
   1162 ; SSE3-NEXT:    addl %edx, %esi
   1163 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
   1164 ; SSE3-NEXT:    movd %xmm0, %edx
   1165 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
   1166 ; SSE3-NEXT:    movd %xmm0, %edi
   1167 ; SSE3-NEXT:    addl %edx, %edi
   1168 ; SSE3-NEXT:    movd %xmm1, %eax
   1169 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
   1170 ; SSE3-NEXT:    movd %xmm0, %r10d
   1171 ; SSE3-NEXT:    addl %eax, %r10d
   1172 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
   1173 ; SSE3-NEXT:    movd %xmm0, %eax
   1174 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
   1175 ; SSE3-NEXT:    movd %xmm0, %ecx
   1176 ; SSE3-NEXT:    addl %eax, %ecx
   1177 ; SSE3-NEXT:    movd %xmm3, %eax
   1178 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
   1179 ; SSE3-NEXT:    movd %xmm0, %edx
   1180 ; SSE3-NEXT:    addl %eax, %edx
   1181 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
   1182 ; SSE3-NEXT:    movd %xmm0, %r11d
   1183 ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
   1184 ; SSE3-NEXT:    movd %xmm0, %eax
   1185 ; SSE3-NEXT:    addl %r11d, %eax
   1186 ; SSE3-NEXT:    movd %edi, %xmm0
   1187 ; SSE3-NEXT:    movd %esi, %xmm1
   1188 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   1189 ; SSE3-NEXT:    movd %r9d, %xmm2
   1190 ; SSE3-NEXT:    movd %r8d, %xmm0
   1191 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   1192 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1193 ; SSE3-NEXT:    movd %eax, %xmm1
   1194 ; SSE3-NEXT:    movd %edx, %xmm2
   1195 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
   1196 ; SSE3-NEXT:    movd %ecx, %xmm3
   1197 ; SSE3-NEXT:    movd %r10d, %xmm1
   1198 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
   1199 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   1200 ; SSE3-NEXT:    retq
   1201 ;
   1202 ; SSSE3-LABEL: avx2_hadd_d:
   1203 ; SSSE3:       # %bb.0:
   1204 ; SSSE3-NEXT:    phaddd %xmm2, %xmm0
   1205 ; SSSE3-NEXT:    phaddd %xmm3, %xmm1
   1206 ; SSSE3-NEXT:    retq
   1207 ;
   1208 ; AVX1-LABEL: avx2_hadd_d:
   1209 ; AVX1:       # %bb.0:
   1210 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1211 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1212 ; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
   1213 ; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
   1214 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1215 ; AVX1-NEXT:    retq
   1216 ;
   1217 ; AVX2-LABEL: avx2_hadd_d:
   1218 ; AVX2:       # %bb.0:
   1219 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
   1220 ; AVX2-NEXT:    retq
   1221   %vecext = extractelement <8 x i32> %a, i32 0
   1222   %vecext1 = extractelement <8 x i32> %a, i32 1
   1223   %add = add i32 %vecext, %vecext1
   1224   %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
   1225   %vecext2 = extractelement <8 x i32> %a, i32 2
   1226   %vecext3 = extractelement <8 x i32> %a, i32 3
   1227   %add4 = add i32 %vecext2, %vecext3
   1228   %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
   1229   %vecext6 = extractelement <8 x i32> %b, i32 0
   1230   %vecext7 = extractelement <8 x i32> %b, i32 1
   1231   %add8 = add i32 %vecext6, %vecext7
   1232   %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
   1233   %vecext10 = extractelement <8 x i32> %b, i32 2
   1234   %vecext11 = extractelement <8 x i32> %b, i32 3
   1235   %add12 = add i32 %vecext10, %vecext11
   1236   %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
   1237   %vecext14 = extractelement <8 x i32> %a, i32 4
   1238   %vecext15 = extractelement <8 x i32> %a, i32 5
   1239   %add16 = add i32 %vecext14, %vecext15
   1240   %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
   1241   %vecext18 = extractelement <8 x i32> %a, i32 6
   1242   %vecext19 = extractelement <8 x i32> %a, i32 7
   1243   %add20 = add i32 %vecext18, %vecext19
   1244   %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
   1245   %vecext22 = extractelement <8 x i32> %b, i32 4
   1246   %vecext23 = extractelement <8 x i32> %b, i32 5
   1247   %add24 = add i32 %vecext22, %vecext23
   1248   %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
   1249   %vecext26 = extractelement <8 x i32> %b, i32 6
   1250   %vecext27 = extractelement <8 x i32> %b, i32 7
   1251   %add28 = add i32 %vecext26, %vecext27
   1252   %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
   1253   ret <8 x i32> %vecinit29
   1254 }
   1255 
   1256 define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
   1257 ; SSE3-LABEL: avx2_hadd_w:
   1258 ; SSE3:       # %bb.0:
   1259 ; SSE3-NEXT:    pushq %rbp
   1260 ; SSE3-NEXT:    .cfi_def_cfa_offset 16
   1261 ; SSE3-NEXT:    pushq %r15
   1262 ; SSE3-NEXT:    .cfi_def_cfa_offset 24
   1263 ; SSE3-NEXT:    pushq %r14
   1264 ; SSE3-NEXT:    .cfi_def_cfa_offset 32
   1265 ; SSE3-NEXT:    pushq %r13
   1266 ; SSE3-NEXT:    .cfi_def_cfa_offset 40
   1267 ; SSE3-NEXT:    pushq %r12
   1268 ; SSE3-NEXT:    .cfi_def_cfa_offset 48
   1269 ; SSE3-NEXT:    pushq %rbx
   1270 ; SSE3-NEXT:    .cfi_def_cfa_offset 56
   1271 ; SSE3-NEXT:    .cfi_offset %rbx, -56
   1272 ; SSE3-NEXT:    .cfi_offset %r12, -48
   1273 ; SSE3-NEXT:    .cfi_offset %r13, -40
   1274 ; SSE3-NEXT:    .cfi_offset %r14, -32
   1275 ; SSE3-NEXT:    .cfi_offset %r15, -24
   1276 ; SSE3-NEXT:    .cfi_offset %rbp, -16
   1277 ; SSE3-NEXT:    movd %xmm0, %eax
   1278 ; SSE3-NEXT:    pextrw $1, %xmm0, %r10d
   1279 ; SSE3-NEXT:    addl %eax, %r10d
   1280 ; SSE3-NEXT:    pextrw $2, %xmm0, %eax
   1281 ; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
   1282 ; SSE3-NEXT:    addl %eax, %r11d
   1283 ; SSE3-NEXT:    pextrw $4, %xmm0, %eax
   1284 ; SSE3-NEXT:    pextrw $5, %xmm0, %r12d
   1285 ; SSE3-NEXT:    addl %eax, %r12d
   1286 ; SSE3-NEXT:    pextrw $6, %xmm0, %eax
   1287 ; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
   1288 ; SSE3-NEXT:    addl %eax, %r13d
   1289 ; SSE3-NEXT:    movd %xmm1, %eax
   1290 ; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
   1291 ; SSE3-NEXT:    addl %eax, %ecx
   1292 ; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
   1293 ; SSE3-NEXT:    pextrw $2, %xmm1, %eax
   1294 ; SSE3-NEXT:    pextrw $3, %xmm1, %ecx
   1295 ; SSE3-NEXT:    addl %eax, %ecx
   1296 ; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
   1297 ; SSE3-NEXT:    pextrw $4, %xmm1, %eax
   1298 ; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
   1299 ; SSE3-NEXT:    addl %eax, %r14d
   1300 ; SSE3-NEXT:    pextrw $6, %xmm1, %esi
   1301 ; SSE3-NEXT:    pextrw $7, %xmm1, %r15d
   1302 ; SSE3-NEXT:    addl %esi, %r15d
   1303 ; SSE3-NEXT:    movd %xmm2, %esi
   1304 ; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
   1305 ; SSE3-NEXT:    addl %esi, %ebp
   1306 ; SSE3-NEXT:    pextrw $2, %xmm2, %esi
   1307 ; SSE3-NEXT:    pextrw $3, %xmm2, %edi
   1308 ; SSE3-NEXT:    addl %esi, %edi
   1309 ; SSE3-NEXT:    pextrw $4, %xmm2, %esi
   1310 ; SSE3-NEXT:    pextrw $5, %xmm2, %eax
   1311 ; SSE3-NEXT:    addl %esi, %eax
   1312 ; SSE3-NEXT:    pextrw $6, %xmm2, %esi
   1313 ; SSE3-NEXT:    pextrw $7, %xmm2, %ecx
   1314 ; SSE3-NEXT:    addl %esi, %ecx
   1315 ; SSE3-NEXT:    movd %xmm3, %ebx
   1316 ; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
   1317 ; SSE3-NEXT:    addl %ebx, %r9d
   1318 ; SSE3-NEXT:    pextrw $2, %xmm3, %edx
   1319 ; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
   1320 ; SSE3-NEXT:    addl %edx, %ebx
   1321 ; SSE3-NEXT:    pextrw $4, %xmm3, %edx
   1322 ; SSE3-NEXT:    pextrw $5, %xmm3, %esi
   1323 ; SSE3-NEXT:    addl %edx, %esi
   1324 ; SSE3-NEXT:    pextrw $6, %xmm3, %r8d
   1325 ; SSE3-NEXT:    pextrw $7, %xmm3, %edx
   1326 ; SSE3-NEXT:    addl %r8d, %edx
   1327 ; SSE3-NEXT:    movd %ecx, %xmm8
   1328 ; SSE3-NEXT:    movd %eax, %xmm3
   1329 ; SSE3-NEXT:    movd %edi, %xmm9
   1330 ; SSE3-NEXT:    movd %ebp, %xmm4
   1331 ; SSE3-NEXT:    movd %r13d, %xmm10
   1332 ; SSE3-NEXT:    movd %r12d, %xmm7
   1333 ; SSE3-NEXT:    movd %r11d, %xmm11
   1334 ; SSE3-NEXT:    movd %r10d, %xmm0
   1335 ; SSE3-NEXT:    movd %edx, %xmm12
   1336 ; SSE3-NEXT:    movd %esi, %xmm6
   1337 ; SSE3-NEXT:    movd %ebx, %xmm13
   1338 ; SSE3-NEXT:    movd %r9d, %xmm5
   1339 ; SSE3-NEXT:    movd %r15d, %xmm14
   1340 ; SSE3-NEXT:    movd %r14d, %xmm2
   1341 ; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
   1342 ; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
   1343 ; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
   1344 ; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
   1345 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
   1346 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
   1347 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
   1348 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
   1349 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
   1350 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
   1351 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
   1352 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
   1353 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
   1354 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
   1355 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
   1356 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
   1357 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   1358 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
   1359 ; SSE3-NEXT:    popq %rbx
   1360 ; SSE3-NEXT:    .cfi_def_cfa_offset 48
   1361 ; SSE3-NEXT:    popq %r12
   1362 ; SSE3-NEXT:    .cfi_def_cfa_offset 40
   1363 ; SSE3-NEXT:    popq %r13
   1364 ; SSE3-NEXT:    .cfi_def_cfa_offset 32
   1365 ; SSE3-NEXT:    popq %r14
   1366 ; SSE3-NEXT:    .cfi_def_cfa_offset 24
   1367 ; SSE3-NEXT:    popq %r15
   1368 ; SSE3-NEXT:    .cfi_def_cfa_offset 16
   1369 ; SSE3-NEXT:    popq %rbp
   1370 ; SSE3-NEXT:    .cfi_def_cfa_offset 8
   1371 ; SSE3-NEXT:    retq
   1372 ;
   1373 ; SSSE3-LABEL: avx2_hadd_w:
   1374 ; SSSE3:       # %bb.0:
   1375 ; SSSE3-NEXT:    phaddw %xmm2, %xmm0
   1376 ; SSSE3-NEXT:    phaddw %xmm3, %xmm1
   1377 ; SSSE3-NEXT:    retq
   1378 ;
   1379 ; AVX1-LABEL: avx2_hadd_w:
   1380 ; AVX1:       # %bb.0:
   1381 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1382 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1383 ; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
   1384 ; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
   1385 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1386 ; AVX1-NEXT:    retq
   1387 ;
   1388 ; AVX2-LABEL: avx2_hadd_w:
   1389 ; AVX2:       # %bb.0:
   1390 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
   1391 ; AVX2-NEXT:    retq
   1392   %vecext = extractelement <16 x i16> %a, i32 0
   1393   %vecext1 = extractelement <16 x i16> %a, i32 1
   1394   %add = add i16 %vecext, %vecext1
   1395   %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
   1396   %vecext4 = extractelement <16 x i16> %a, i32 2
   1397   %vecext6 = extractelement <16 x i16> %a, i32 3
   1398   %add8 = add i16 %vecext4, %vecext6
   1399   %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
   1400   %vecext11 = extractelement <16 x i16> %a, i32 4
   1401   %vecext13 = extractelement <16 x i16> %a, i32 5
   1402   %add15 = add i16 %vecext11, %vecext13
   1403   %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
   1404   %vecext18 = extractelement <16 x i16> %a, i32 6
   1405   %vecext20 = extractelement <16 x i16> %a, i32 7
   1406   %add22 = add i16 %vecext18, %vecext20
   1407   %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
   1408   %vecext25 = extractelement <16 x i16> %a, i32 8
   1409   %vecext27 = extractelement <16 x i16> %a, i32 9
   1410   %add29 = add i16 %vecext25, %vecext27
   1411   %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
   1412   %vecext32 = extractelement <16 x i16> %a, i32 10
   1413   %vecext34 = extractelement <16 x i16> %a, i32 11
   1414   %add36 = add i16 %vecext32, %vecext34
   1415   %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
   1416   %vecext39 = extractelement <16 x i16> %a, i32 12
   1417   %vecext41 = extractelement <16 x i16> %a, i32 13
   1418   %add43 = add i16 %vecext39, %vecext41
   1419   %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
   1420   %vecext46 = extractelement <16 x i16> %a, i32 14
   1421   %vecext48 = extractelement <16 x i16> %a, i32 15
   1422   %add50 = add i16 %vecext46, %vecext48
   1423   %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
   1424   %vecext53 = extractelement <16 x i16> %b, i32 0
   1425   %vecext55 = extractelement <16 x i16> %b, i32 1
   1426   %add57 = add i16 %vecext53, %vecext55
   1427   %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
   1428   %vecext60 = extractelement <16 x i16> %b, i32 2
   1429   %vecext62 = extractelement <16 x i16> %b, i32 3
   1430   %add64 = add i16 %vecext60, %vecext62
   1431   %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
   1432   %vecext67 = extractelement <16 x i16> %b, i32 4
   1433   %vecext69 = extractelement <16 x i16> %b, i32 5
   1434   %add71 = add i16 %vecext67, %vecext69
   1435   %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
   1436   %vecext74 = extractelement <16 x i16> %b, i32 6
   1437   %vecext76 = extractelement <16 x i16> %b, i32 7
   1438   %add78 = add i16 %vecext74, %vecext76
   1439   %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
   1440   %vecext81 = extractelement <16 x i16> %b, i32 8
   1441   %vecext83 = extractelement <16 x i16> %b, i32 9
   1442   %add85 = add i16 %vecext81, %vecext83
   1443   %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
   1444   %vecext88 = extractelement <16 x i16> %b, i32 10
   1445   %vecext90 = extractelement <16 x i16> %b, i32 11
   1446   %add92 = add i16 %vecext88, %vecext90
   1447   %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
   1448   %vecext95 = extractelement <16 x i16> %b, i32 12
   1449   %vecext97 = extractelement <16 x i16> %b, i32 13
   1450   %add99 = add i16 %vecext95, %vecext97
   1451   %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
   1452   %vecext102 = extractelement <16 x i16> %b, i32 14
   1453   %vecext104 = extractelement <16 x i16> %b, i32 15
   1454   %add106 = add i16 %vecext102, %vecext104
   1455   %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
   1456   ret <16 x i16> %vecinit108
   1457 }
   1458