; Tests that unpack-style blends of two horizontal-op / pack results fold to a single instruction.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64

;
; 128-bit Vectors
;

; Low-half blend of two HADDPS results only uses %a0/%a2, so it folds to one vhaddps.
define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %3
}

; High-element blend of two HADDPD results only uses %a1/%a3, so it folds to one vhaddpd.
define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %3
}

; Low-element blend of two HSUBPD results only uses %a0/%a2, so it folds to one vhsubpd.
define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %3
}

; High-half blend of two HSUBPS results only uses %a1/%a3, so it folds to one vhsubps.
define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %3
}

; Low-half blend of two PHADDW results only uses %a0/%a2, so it folds to one vphaddw.
define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

; High-half blend of two PHADDD results only uses %a1/%a3, so it folds to one vphaddd.
define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i32> %3
}

; Low-half blend of two PHSUBD results only uses %a0/%a2, so it folds to one vphsubd.
define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %3
}

; High-half blend of two PHSUBW results only uses %a1/%a3, so it folds to one vphsubw.
define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

; Low-half blend of two PACKSSWB results only uses %a0/%a2, so it folds to one vpacksswb.
define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packss_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %3
}

; High-half blend of two PACKSSDW results only uses %a1/%a3, so it folds to one vpackssdw.
define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packss_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

; Low-half blend of two PACKUSDW results only uses %a0/%a2, so it folds to one vpackusdw.
define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packus_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

; High-half blend of two PACKUSWB results only uses %a1/%a3, so it folds to one vpackuswb.
define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packus_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %3
}

;
; 256-bit Vectors
;

; Per-128-bit-lane low-half blend of two 256-bit HADDPS results folds to one vhaddps of %a0/%a2.
define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x float> %3
}

; Per-128-bit-lane high-element blend of two 256-bit HADDPD results folds to one vhaddpd of %a1/%a3.
define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %3
}

; Per-128-bit-lane low-element blend of two 256-bit HSUBPD results folds to one vhsubpd of %a0/%a2.
define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %3
}

; Per-128-bit-lane high-half blend of two 256-bit HSUBPS results folds to one vhsubps of %a1/%a3.
define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x float> %3
}

; Per-128-bit-lane low-half blend of two 256-bit PHADDW results folds to one vphaddw of %a0/%a2.
define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

; Per-128-bit-lane high-half blend of two 256-bit PHADDD results folds to one vphaddd of %a1/%a3.
define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x i32> %3
}

; Per-128-bit-lane low-half blend of two 256-bit PHSUBD results folds to one vphsubd of %a0/%a2.
define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %3
}

; Per-128-bit-lane high-half blend of two 256-bit PHSUBW results folds to one vphsubw of %a1/%a3.
define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

; Per-128-bit-lane low-half blend of two 256-bit PACKSSWB results folds to one vpacksswb of %a0/%a2.
define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packss_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  ret <32 x i8> %3
}

; Per-128-bit-lane high-half blend of two 256-bit PACKSSDW results folds to one vpackssdw of %a1/%a3.
define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packss_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

; Per-128-bit-lane low-half blend of two 256-bit PACKUSDW results folds to one vpackusdw of %a0/%a2.
define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packus_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

    381 define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
    382 ; X32-LABEL: test_unpackh_packus_256:
    383 ; X32:       ## %bb.0:
    384 ; X32-NEXT:    vpacksswb %ymm3, %ymm1, %ymm0
    385 ; X32-NEXT:    retl
    386 ;
    387 ; X64-LABEL: test_unpackh_packus_256:
    388 ; X64:       ## %bb.0:
    389 ; X64-NEXT:    vpacksswb %ymm3, %ymm1, %ymm0
    390 ; X64-NEXT:    retq
    391   %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
    392   %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
    393   %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
    394   ret <32 x i8> %3
    395 }
    396 
; SSE3 floating-point horizontal add/sub intrinsics.
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

; SSSE3 integer horizontal add/sub intrinsics.
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

; SSE2/SSE4.1 128-bit pack-with-saturation intrinsics.
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

; AVX 256-bit floating-point horizontal add/sub intrinsics.
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)

; AVX2 256-bit integer horizontal add/sub intrinsics.
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)

; AVX2 256-bit pack-with-saturation intrinsics.
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)