Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
      8 
      9 define <8 x i32> @test_broadcasti128(<8 x i32> %a0, <4 x i32> *%a1) {
        ; Loads a <4 x i32> from %a1, repeats it into both 128-bit halves of an
        ; <8 x i32> with shufflevector, then adds %a0. For every CPU prefix the
        ; expected output is a memory-source vbroadcasti128 followed by vpaddd;
        ; only the `# sched` annotations differ between CPUs.
     10 ; GENERIC-LABEL: test_broadcasti128:
     11 ; GENERIC:       # %bb.0:
     12 ; GENERIC-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:1.00]
     13 ; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
     14 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     15 ;
     16 ; HASWELL-LABEL: test_broadcasti128:
     17 ; HASWELL:       # %bb.0:
     18 ; HASWELL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
     19 ; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
     20 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     21 ;
     22 ; BROADWELL-LABEL: test_broadcasti128:
     23 ; BROADWELL:       # %bb.0:
     24 ; BROADWELL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:0.50]
     25 ; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
     26 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
     27 ;
     28 ; SKYLAKE-LABEL: test_broadcasti128:
     29 ; SKYLAKE:       # %bb.0:
     30 ; SKYLAKE-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
     31 ; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
     32 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
     33 ;
     34 ; SKX-LABEL: test_broadcasti128:
     35 ; SKX:       # %bb.0:
     36 ; SKX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
     37 ; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
     38 ; SKX-NEXT:    retq # sched: [7:1.00]
     39 ;
     40 ; ZNVER1-LABEL: test_broadcasti128:
     41 ; ZNVER1:       # %bb.0:
     42 ; ZNVER1-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [8:0.50]
     43 ; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
     44 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
     45   %1 = load <4 x i32>, <4 x i32> *%a1, align 16
        ; Mask <0,1,2,3,0,1,2,3> copies the loaded 128-bit value into both halves.
     46   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
     47   %3 = add <8 x i32> %2, %a0
     48   ret <8 x i32> %3
     49 }
     50 
     51 define <4 x double> @test_broadcastsd_ymm(<2 x double> %a0) {
        ; Splats element 0 of %a0 into a <4 x double> (all-zero shuffle mask) and
        ; adds the splat to itself. Every CPU prefix expects a register-form
        ; vbroadcastsd (xmm to ymm) followed by vaddpd; only the `# sched`
        ; annotations differ between CPUs.
     52 ; GENERIC-LABEL: test_broadcastsd_ymm:
     53 ; GENERIC:       # %bb.0:
     54 ; GENERIC-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [1:1.00]
     55 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
     56 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     57 ;
     58 ; HASWELL-LABEL: test_broadcastsd_ymm:
     59 ; HASWELL:       # %bb.0:
     60 ; HASWELL-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
     61 ; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
     62 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     63 ;
     64 ; BROADWELL-LABEL: test_broadcastsd_ymm:
     65 ; BROADWELL:       # %bb.0:
     66 ; BROADWELL-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
     67 ; BROADWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
     68 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
     69 ;
     70 ; SKYLAKE-LABEL: test_broadcastsd_ymm:
     71 ; SKYLAKE:       # %bb.0:
     72 ; SKYLAKE-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
     73 ; SKYLAKE-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
     74 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
     75 ;
     76 ; SKX-LABEL: test_broadcastsd_ymm:
     77 ; SKX:       # %bb.0:
     78 ; SKX-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
     79 ; SKX-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
     80 ; SKX-NEXT:    retq # sched: [7:1.00]
     81 ;
     82 ; ZNVER1-LABEL: test_broadcastsd_ymm:
     83 ; ZNVER1:       # %bb.0:
     84 ; ZNVER1-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [100:0.25]
     85 ; ZNVER1-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
     86 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
     87   %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
        ; NOTE(review): the self-add presumably keeps the splat in the FP domain so
        ; the FP broadcast is selected -- confirm against the backend.
     88   %2 = fadd <4 x double> %1, %1
     89   ret <4 x double> %2
     90 }
     91 
     92 define <4 x float> @test_broadcastss(<4 x float> %a0) {
        ; Splats element 0 of %a0 across a <4 x float> (all-zero shuffle mask) and
        ; adds the splat to itself. Every CPU prefix expects a 128-bit register-form
        ; vbroadcastss followed by vaddps; only the `# sched` annotations differ
        ; between CPUs.
     93 ; GENERIC-LABEL: test_broadcastss:
     94 ; GENERIC:       # %bb.0:
     95 ; GENERIC-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
     96 ; GENERIC-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
     97 ; GENERIC-NEXT:    retq # sched: [1:1.00]
     98 ;
     99 ; HASWELL-LABEL: test_broadcastss:
    100 ; HASWELL:       # %bb.0:
    101 ; HASWELL-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
    102 ; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
    103 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    104 ;
    105 ; BROADWELL-LABEL: test_broadcastss:
    106 ; BROADWELL:       # %bb.0:
    107 ; BROADWELL-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
    108 ; BROADWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
    109 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    110 ;
    111 ; SKYLAKE-LABEL: test_broadcastss:
    112 ; SKYLAKE:       # %bb.0:
    113 ; SKYLAKE-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
    114 ; SKYLAKE-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
    115 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    116 ;
    117 ; SKX-LABEL: test_broadcastss:
    118 ; SKX:       # %bb.0:
    119 ; SKX-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
    120 ; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
    121 ; SKX-NEXT:    retq # sched: [7:1.00]
    122 ;
    123 ; ZNVER1-LABEL: test_broadcastss:
    124 ; ZNVER1:       # %bb.0:
    125 ; ZNVER1-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:0.50]
    126 ; ZNVER1-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
    127 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    128   %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
        ; NOTE(review): the self-add presumably keeps the splat in the FP domain so
        ; the FP broadcast is selected -- confirm against the backend.
    129   %2 = fadd <4 x float> %1, %1
    130   ret <4 x float> %2
    131 }
    132 
    133 define <8 x float> @test_broadcastss_ymm(<4 x float> %a0) {
        ; Splats element 0 of %a0 across an <8 x float> (all-zero shuffle mask) and
        ; adds the splat to itself. Every CPU prefix expects a register-form
        ; vbroadcastss (xmm to ymm) followed by a 256-bit vaddps; only the
        ; `# sched` annotations differ between CPUs.
    134 ; GENERIC-LABEL: test_broadcastss_ymm:
    135 ; GENERIC:       # %bb.0:
    136 ; GENERIC-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [1:1.00]
    137 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
    138 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    139 ;
    140 ; HASWELL-LABEL: test_broadcastss_ymm:
    141 ; HASWELL:       # %bb.0:
    142 ; HASWELL-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
    143 ; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
    144 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    145 ;
    146 ; BROADWELL-LABEL: test_broadcastss_ymm:
    147 ; BROADWELL:       # %bb.0:
    148 ; BROADWELL-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
    149 ; BROADWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
    150 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    151 ;
    152 ; SKYLAKE-LABEL: test_broadcastss_ymm:
    153 ; SKYLAKE:       # %bb.0:
    154 ; SKYLAKE-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
    155 ; SKYLAKE-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
    156 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    157 ;
    158 ; SKX-LABEL: test_broadcastss_ymm:
    159 ; SKX:       # %bb.0:
    160 ; SKX-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
    161 ; SKX-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
    162 ; SKX-NEXT:    retq # sched: [7:1.00]
    163 ;
    164 ; ZNVER1-LABEL: test_broadcastss_ymm:
    165 ; ZNVER1:       # %bb.0:
    166 ; ZNVER1-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [100:0.25]
    167 ; ZNVER1-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
    168 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    169   %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
        ; NOTE(review): the self-add presumably keeps the splat in the FP domain so
        ; the FP broadcast is selected -- confirm against the backend.
    170   %2 = fadd <8 x float> %1, %1
    171   ret <8 x float> %2
    172 }
    173 
    174 define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) {
        ; Computes %a0+%a1 and %a0-%a1, takes the upper four elements (indices 4-7)
        ; of each, stores the sum's upper half to %a2 and returns the difference's
        ; upper half. Every CPU prefix expects both the register-destination and the
        ; memory-destination form of vextracti128 $1, then vzeroupper before retq;
        ; only the `# sched` annotations differ between CPUs.
    175 ; GENERIC-LABEL: test_extracti128:
    176 ; GENERIC:       # %bb.0:
    177 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
    178 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    179 ; GENERIC-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00]
    180 ; GENERIC-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
    181 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
    182 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    183 ;
    184 ; HASWELL-LABEL: test_extracti128:
    185 ; HASWELL:       # %bb.0:
    186 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
    187 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    188 ; HASWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
    189 ; HASWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
    190 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
    191 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    192 ;
    193 ; BROADWELL-LABEL: test_extracti128:
    194 ; BROADWELL:       # %bb.0:
    195 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
    196 ; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    197 ; BROADWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
    198 ; BROADWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
    199 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
    200 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    201 ;
    202 ; SKYLAKE-LABEL: test_extracti128:
    203 ; SKYLAKE:       # %bb.0:
    204 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
    205 ; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    206 ; SKYLAKE-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
    207 ; SKYLAKE-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
    208 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
    209 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    210 ;
    211 ; SKX-LABEL: test_extracti128:
    212 ; SKX:       # %bb.0:
    213 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
    214 ; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    215 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
    216 ; SKX-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
    217 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
    218 ; SKX-NEXT:    retq # sched: [7:1.00]
    219 ;
    220 ; ZNVER1-LABEL: test_extracti128:
    221 ; ZNVER1:       # %bb.0:
    222 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.25]
    223 ; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    224 ; ZNVER1-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [2:0.25]
    225 ; ZNVER1-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:0.50]
    226 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
    227 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    228   %1 = add <8 x i32> %a0, %a1
    229   %2 = sub <8 x i32> %a0, %a1
        ; Mask <4,5,6,7> selects the upper 128 bits of each 256-bit result.
    230   %3 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    231   %4 = shufflevector <8 x i32> %2, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        ; The store exercises the memory-destination form; the return value
        ; exercises the register-destination form.
    232   store <4 x i32> %3, <4 x i32> *%a2
    233   ret <4 x i32> %4
    234 }
    235 
    236 define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.d.pd
        ; intrinsic with a trailing i8 2 and returns the result. Every CPU prefix
        ; expects one 128-bit vgatherdpd whose addressing mode uses %rdi as base,
        ; %xmm1 as index and 2 as the scale; only the `# sched` annotations differ
        ; between CPUs.
    237 ; GENERIC-LABEL: test_gatherdpd:
    238 ; GENERIC:       # %bb.0:
    239 ; GENERIC-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
    240 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    241 ;
    242 ; HASWELL-LABEL: test_gatherdpd:
    243 ; HASWELL:       # %bb.0:
    244 ; HASWELL-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
    245 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    246 ;
    247 ; BROADWELL-LABEL: test_gatherdpd:
    248 ; BROADWELL:       # %bb.0:
    249 ; BROADWELL-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
    250 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    251 ;
    252 ; SKYLAKE-LABEL: test_gatherdpd:
    253 ; SKYLAKE:       # %bb.0:
    254 ; SKYLAKE-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    255 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    256 ;
    257 ; SKX-LABEL: test_gatherdpd:
    258 ; SKX:       # %bb.0:
    259 ; SKX-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    260 ; SKX-NEXT:    retq # sched: [7:1.00]
    261 ;
    262 ; ZNVER1-LABEL: test_gatherdpd:
    263 ; ZNVER1:       # %bb.0:
    264 ; ZNVER1-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
    265 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    266   %1 = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3, i8 2)
    267   ret <2 x double> %1
    268 }
    269 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
    270 
    271 define <4 x double> @test_gatherdpd_ymm(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.d.pd.256
        ; intrinsic with a trailing i8 8 and returns the result. Every CPU prefix
        ; expects one 256-bit vgatherdpd with an xmm index register and scale 8;
        ; only the `# sched` annotations differ between CPUs.
    272 ; GENERIC-LABEL: test_gatherdpd_ymm:
    273 ; GENERIC:       # %bb.0:
    274 ; GENERIC-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [5:0.50]
    275 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    276 ;
    277 ; HASWELL-LABEL: test_gatherdpd_ymm:
    278 ; HASWELL:       # %bb.0:
    279 ; HASWELL-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [27:4.00]
    280 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    281 ;
    282 ; BROADWELL-LABEL: test_gatherdpd_ymm:
    283 ; BROADWELL:       # %bb.0:
    284 ; BROADWELL-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [26:5.00]
    285 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    286 ;
    287 ; SKYLAKE-LABEL: test_gatherdpd_ymm:
    288 ; SKYLAKE:       # %bb.0:
    289 ; SKYLAKE-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
    290 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    291 ;
    292 ; SKX-LABEL: test_gatherdpd_ymm:
    293 ; SKX:       # %bb.0:
    294 ; SKX-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
    295 ; SKX-NEXT:    retq # sched: [7:1.00]
    296 ;
    297 ; ZNVER1-LABEL: test_gatherdpd_ymm:
    298 ; ZNVER1:       # %bb.0:
    299 ; ZNVER1-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [100:0.25]
    300 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    301   %1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3, i8 8)
    302   ret <4 x double> %1
    303 }
    304 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
    305 
    306 define <4 x float> @test_gatherdps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.d.ps
        ; intrinsic with a trailing i8 2 and returns the result. Every CPU prefix
        ; expects one 128-bit vgatherdps with an xmm index register and scale 2;
        ; only the `# sched` annotations differ between CPUs.
    307 ; GENERIC-LABEL: test_gatherdps:
    308 ; GENERIC:       # %bb.0:
    309 ; GENERIC-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
    310 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    311 ;
    312 ; HASWELL-LABEL: test_gatherdps:
    313 ; HASWELL:       # %bb.0:
    314 ; HASWELL-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
    315 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    316 ;
    317 ; BROADWELL-LABEL: test_gatherdps:
    318 ; BROADWELL:       # %bb.0:
    319 ; BROADWELL-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
    320 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    321 ;
    322 ; SKYLAKE-LABEL: test_gatherdps:
    323 ; SKYLAKE:       # %bb.0:
    324 ; SKYLAKE-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    325 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    326 ;
    327 ; SKX-LABEL: test_gatherdps:
    328 ; SKX:       # %bb.0:
    329 ; SKX-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    330 ; SKX-NEXT:    retq # sched: [7:1.00]
    331 ;
    332 ; ZNVER1-LABEL: test_gatherdps:
    333 ; ZNVER1:       # %bb.0:
    334 ; ZNVER1-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
    335 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    336   %1 = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3, i8 2)
    337   ret <4 x float> %1
    338 }
    339 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
    340 
    341 define <8 x float> @test_gatherdps_ymm(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.d.ps.256
        ; intrinsic with a trailing i8 4 and returns the result. Every CPU prefix
        ; expects one 256-bit vgatherdps with a ymm index register and scale 4;
        ; only the `# sched` annotations differ between CPUs.
    342 ; GENERIC-LABEL: test_gatherdps_ymm:
    343 ; GENERIC:       # %bb.0:
    344 ; GENERIC-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [5:0.50]
    345 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    346 ;
    347 ; HASWELL-LABEL: test_gatherdps_ymm:
    348 ; HASWELL:       # %bb.0:
    349 ; HASWELL-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [27:6.50]
    350 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    351 ;
    352 ; BROADWELL-LABEL: test_gatherdps_ymm:
    353 ; BROADWELL:       # %bb.0:
    354 ; BROADWELL-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [26:4.00]
    355 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    356 ;
    357 ; SKYLAKE-LABEL: test_gatherdps_ymm:
    358 ; SKYLAKE:       # %bb.0:
    359 ; SKYLAKE-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
    360 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    361 ;
    362 ; SKX-LABEL: test_gatherdps_ymm:
    363 ; SKX:       # %bb.0:
    364 ; SKX-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
    365 ; SKX-NEXT:    retq # sched: [7:1.00]
    366 ;
    367 ; ZNVER1-LABEL: test_gatherdps_ymm:
    368 ; ZNVER1:       # %bb.0:
    369 ; ZNVER1-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [100:0.25]
    370 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    371   %1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3, i8 4)
    372   ret <8 x float> %1
    373 }
    374 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
    375 
    376 define <2 x double> @test_gatherqpd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.q.pd
        ; intrinsic with a trailing i8 2 and returns the result. Every CPU prefix
        ; expects one 128-bit vgatherqpd with an xmm index register and scale 2;
        ; only the `# sched` annotations differ between CPUs.
    377 ; GENERIC-LABEL: test_gatherqpd:
    378 ; GENERIC:       # %bb.0:
    379 ; GENERIC-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
    380 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    381 ;
    382 ; HASWELL-LABEL: test_gatherqpd:
    383 ; HASWELL:       # %bb.0:
    384 ; HASWELL-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
    385 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    386 ;
    387 ; BROADWELL-LABEL: test_gatherqpd:
    388 ; BROADWELL:       # %bb.0:
    389 ; BROADWELL-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:3.00]
    390 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    391 ;
    392 ; SKYLAKE-LABEL: test_gatherqpd:
    393 ; SKYLAKE:       # %bb.0:
    394 ; SKYLAKE-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    395 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    396 ;
    397 ; SKX-LABEL: test_gatherqpd:
    398 ; SKX:       # %bb.0:
    399 ; SKX-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    400 ; SKX-NEXT:    retq # sched: [7:1.00]
    401 ;
    402 ; ZNVER1-LABEL: test_gatherqpd:
    403 ; ZNVER1:       # %bb.0:
    404 ; ZNVER1-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
    405 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    406   %1 = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3, i8 2)
    407   ret <2 x double> %1
    408 }
    409 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
    410 
    411 define <4 x double> @test_gatherqpd_ymm(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.q.pd.256
        ; intrinsic with a trailing i8 8 and returns the result. Every CPU prefix
        ; expects one 256-bit vgatherqpd with a ymm index register and scale 8;
        ; only the `# sched` annotations differ between CPUs.
    412 ; GENERIC-LABEL: test_gatherqpd_ymm:
    413 ; GENERIC:       # %bb.0:
    414 ; GENERIC-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [5:0.50]
    415 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    416 ;
    417 ; HASWELL-LABEL: test_gatherqpd_ymm:
    418 ; HASWELL:       # %bb.0:
    419 ; HASWELL-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [24:5.00]
    420 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    421 ;
    422 ; BROADWELL-LABEL: test_gatherqpd_ymm:
    423 ; BROADWELL:       # %bb.0:
    424 ; BROADWELL-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [23:3.00]
    425 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    426 ;
    427 ; SKYLAKE-LABEL: test_gatherqpd_ymm:
    428 ; SKYLAKE:       # %bb.0:
    429 ; SKYLAKE-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
    430 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    431 ;
    432 ; SKX-LABEL: test_gatherqpd_ymm:
    433 ; SKX:       # %bb.0:
    434 ; SKX-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
    435 ; SKX-NEXT:    retq # sched: [7:1.00]
    436 ;
    437 ; ZNVER1-LABEL: test_gatherqpd_ymm:
    438 ; ZNVER1:       # %bb.0:
    439 ; ZNVER1-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [100:0.25]
    440 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    441   %1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3, i8 8)
    442   ret <4 x double> %1
    443 }
    444 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
    445 
    446 define <4 x float> @test_gatherqps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.q.ps
        ; intrinsic with a trailing i8 2 and returns the result. Every CPU prefix
        ; expects one 128-bit vgatherqps with an xmm index register and scale 2;
        ; only the `# sched` annotations differ between CPUs.
    447 ; GENERIC-LABEL: test_gatherqps:
    448 ; GENERIC:       # %bb.0:
    449 ; GENERIC-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
    450 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    451 ;
    452 ; HASWELL-LABEL: test_gatherqps:
    453 ; HASWELL:       # %bb.0:
    454 ; HASWELL-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
    455 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    456 ;
    457 ; BROADWELL-LABEL: test_gatherqps:
    458 ; BROADWELL:       # %bb.0:
    459 ; BROADWELL-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [27:5.00]
    460 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    461 ;
    462 ; SKYLAKE-LABEL: test_gatherqps:
    463 ; SKYLAKE:       # %bb.0:
    464 ; SKYLAKE-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    465 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    466 ;
    467 ; SKX-LABEL: test_gatherqps:
    468 ; SKX:       # %bb.0:
    469 ; SKX-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
    470 ; SKX-NEXT:    retq # sched: [7:1.00]
    471 ;
    472 ; ZNVER1-LABEL: test_gatherqps:
    473 ; ZNVER1:       # %bb.0:
    474 ; ZNVER1-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
    475 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    476   %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3, i8 2)
    477   ret <4 x float> %1
    478 }
    479 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
    480 
    481 define <4 x float> @test_gatherqps_ymm(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3) {
        ; Forwards its arguments unchanged to the llvm.x86.avx2.gather.q.ps.256
        ; intrinsic with a trailing i8 4 and returns the result. Every CPU prefix
        ; expects one vgatherqps with a ymm index register, xmm data registers and
        ; scale 4, then vzeroupper before retq; only the `# sched` annotations
        ; differ between CPUs.
    482 ; GENERIC-LABEL: test_gatherqps_ymm:
    483 ; GENERIC:       # %bb.0:
    484 ; GENERIC-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [5:0.50]
    485 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
    486 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    487 ;
    488 ; HASWELL-LABEL: test_gatherqps_ymm:
    489 ; HASWELL:       # %bb.0:
    490 ; HASWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [28:3.67]
    491 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
    492 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    493 ;
    494 ; BROADWELL-LABEL: test_gatherqps_ymm:
    495 ; BROADWELL:       # %bb.0:
    496 ; BROADWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [24:5.00]
    497 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
    498 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    499 ;
    500 ; SKYLAKE-LABEL: test_gatherqps_ymm:
    501 ; SKYLAKE:       # %bb.0:
    502 ; SKYLAKE-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
    503 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
    504 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    505 ;
    506 ; SKX-LABEL: test_gatherqps_ymm:
    507 ; SKX:       # %bb.0:
    508 ; SKX-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
    509 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
    510 ; SKX-NEXT:    retq # sched: [7:1.00]
    511 ;
    512 ; ZNVER1-LABEL: test_gatherqps_ymm:
    513 ; ZNVER1:       # %bb.0:
    514 ; ZNVER1-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [100:0.25]
    515 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
    516 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; The final i8 operand shows up as the scale in the expected address.
    517   %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3, i8 4)
    518   ret <4 x float> %1
    519 }
    520 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
    521 
    522 define <8 x i32> @test_inserti128(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Builds two <8 x i32> values whose lower half is %a0's lower half and whose
        ; upper half is a 128-bit value (%a1 in one case, a load from %a2 in the
        ; other), then adds them. Every CPU prefix expects the register-source and
        ; memory-source forms of vinserti128 $1 followed by vpaddd; only the
        ; `# sched` annotations differ between CPUs.
    523 ; GENERIC-LABEL: test_inserti128:
    524 ; GENERIC:       # %bb.0:
    525 ; GENERIC-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
    526 ; GENERIC-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    527 ; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
    528 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    529 ;
    530 ; HASWELL-LABEL: test_inserti128:
    531 ; HASWELL:       # %bb.0:
    532 ; HASWELL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
    533 ; HASWELL-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
    534 ; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
    535 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    536 ;
    537 ; BROADWELL-LABEL: test_inserti128:
    538 ; BROADWELL:       # %bb.0:
    539 ; BROADWELL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
    540 ; BROADWELL-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:0.50]
    541 ; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
    542 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    543 ;
    544 ; SKYLAKE-LABEL: test_inserti128:
    545 ; SKYLAKE:       # %bb.0:
    546 ; SKYLAKE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
    547 ; SKYLAKE-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
    548 ; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
    549 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    550 ;
    551 ; SKX-LABEL: test_inserti128:
    552 ; SKX:       # %bb.0:
    553 ; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
    554 ; SKX-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
    555 ; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
    556 ; SKX-NEXT:    retq # sched: [7:1.00]
    557 ;
    558 ; ZNVER1-LABEL: test_inserti128:
    559 ; ZNVER1:       # %bb.0:
    560 ; ZNVER1-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.25]
    561 ; ZNVER1-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
    562 ; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
    563 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Widen %a1 to eight elements (upper four undef) so it can be blended.
    564   %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
        ; Indices 0-3 keep %a0's lower half; 8-11 take the widened value's lower half.
    565   %2 = shufflevector <8 x i32> %a0, <8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
        ; Same widen-and-blend pattern, this time with the 128-bit value loaded from %a2.
    566   %3 = load <4 x i32>, <4 x i32> *%a2, align 16
    567   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
    568   %5 = shufflevector <8 x i32> %a0, <8 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
    569   %6 = add <8 x i32> %2, %5
    570   ret <8 x i32> %6
    571 }
    572 
    573 define <4 x i64> @test_movntdqa(i8* %a0) {
        ; Calls the llvm.x86.avx2.movntdqa intrinsic on pointer %a0 and returns the
        ; <4 x i64> result. Every CPU prefix expects a single 256-bit vmovntdqa load
        ; from (%rdi); only the `# sched` annotations differ between CPUs.
    574 ; GENERIC-LABEL: test_movntdqa:
    575 ; GENERIC:       # %bb.0:
    576 ; GENERIC-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
    577 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    578 ;
    579 ; HASWELL-LABEL: test_movntdqa:
    580 ; HASWELL:       # %bb.0:
    581 ; HASWELL-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
    582 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    583 ;
    584 ; BROADWELL-LABEL: test_movntdqa:
    585 ; BROADWELL:       # %bb.0:
    586 ; BROADWELL-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [6:0.50]
    587 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    588 ;
    589 ; SKYLAKE-LABEL: test_movntdqa:
    590 ; SKYLAKE:       # %bb.0:
    591 ; SKYLAKE-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
    592 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    593 ;
    594 ; SKX-LABEL: test_movntdqa:
    595 ; SKX:       # %bb.0:
    596 ; SKX-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
    597 ; SKX-NEXT:    retq # sched: [7:1.00]
    598 ;
    599 ; ZNVER1-LABEL: test_movntdqa:
    600 ; ZNVER1:       # %bb.0:
    601 ; ZNVER1-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [8:0.50]
    602 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    603   %1 = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0)
    604   ret <4 x i64> %1
    605 }
    606 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
    607 
    608 define <16 x i16> @test_mpsadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Calls llvm.x86.avx2.mpsadbw twice with immediate 7: first on %a0 and %a1,
        ; then on that result (bitcast back to <32 x i8>) and a vector loaded from
        ; %a2. Every CPU prefix expects a register-operand vmpsadbw followed by a
        ; memory-operand vmpsadbw; only the `# sched` annotations differ between CPUs.
    609 ; GENERIC-LABEL: test_mpsadbw:
    610 ; GENERIC:       # %bb.0:
    611 ; GENERIC-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:1.00]
    612 ; GENERIC-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:1.00]
    613 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    614 ;
    615 ; HASWELL-LABEL: test_mpsadbw:
    616 ; HASWELL:       # %bb.0:
    617 ; HASWELL-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
    618 ; HASWELL-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
    619 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    620 ;
    621 ; BROADWELL-LABEL: test_mpsadbw:
    622 ; BROADWELL:       # %bb.0:
    623 ; BROADWELL-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
    624 ; BROADWELL-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
    625 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    626 ;
    627 ; SKYLAKE-LABEL: test_mpsadbw:
    628 ; SKYLAKE:       # %bb.0:
    629 ; SKYLAKE-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
    630 ; SKYLAKE-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
    631 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    632 ;
    633 ; SKX-LABEL: test_mpsadbw:
    634 ; SKX:       # %bb.0:
    635 ; SKX-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
    636 ; SKX-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
    637 ; SKX-NEXT:    retq # sched: [7:1.00]
    638 ;
    639 ; ZNVER1-LABEL: test_mpsadbw:
    640 ; ZNVER1:       # %bb.0:
    641 ; ZNVER1-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
    642 ; ZNVER1-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
    643 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    644   %1 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
        ; The intrinsic takes <32 x i8> inputs, so the <16 x i16> result is bitcast
        ; before being fed into the second call.
    645   %2 = bitcast <16 x i16> %1 to <32 x i8>
    646   %3 = load <32 x i8>, <32 x i8> *%a2, align 32
    647   %4 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %2, <32 x i8> %3, i8 7)
    648   ret <16 x i16> %4
    649 }
    650 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
    651 
    652 define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
        ; Schedule test for the 256-bit vpabsb: takes the absolute value of the
        ; register argument and of a 32-byte-aligned load from %a1, then ORs the
        ; two results so both the register form and the folded-load form are kept.
        ; The znver1 expectations list the load form first; the other CPUs list
        ; the register form first.
    653 ; GENERIC-LABEL: test_pabsb:
    654 ; GENERIC:       # %bb.0:
    655 ; GENERIC-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
    656 ; GENERIC-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
    657 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    658 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    659 ;
    660 ; HASWELL-LABEL: test_pabsb:
    661 ; HASWELL:       # %bb.0:
    662 ; HASWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
    663 ; HASWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
    664 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    665 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    666 ;
    667 ; BROADWELL-LABEL: test_pabsb:
    668 ; BROADWELL:       # %bb.0:
    669 ; BROADWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
    670 ; BROADWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [7:0.50]
    671 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    672 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    673 ;
    674 ; SKYLAKE-LABEL: test_pabsb:
    675 ; SKYLAKE:       # %bb.0:
    676 ; SKYLAKE-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
    677 ; SKYLAKE-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
    678 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    679 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    680 ;
    681 ; SKX-LABEL: test_pabsb:
    682 ; SKX:       # %bb.0:
    683 ; SKX-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
    684 ; SKX-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
    685 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    686 ; SKX-NEXT:    retq # sched: [7:1.00]
    687 ;
    688 ; ZNVER1-LABEL: test_pabsb:
    689 ; ZNVER1:       # %bb.0:
    690 ; ZNVER1-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
    691 ; ZNVER1-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.25]
    692 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    693 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    694   %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
    695   %2 = load <32 x i8>, <32 x i8> *%a1, align 32
    696   %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2)
    697   %4 = or <32 x i8> %1, %3
    698   ret <32 x i8> %4
    699 }
    700 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
    701 
    702 define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
        ; Schedule test for the 256-bit vpabsd: takes the absolute value of the
        ; register argument and of a 32-byte-aligned load from %a1, then ORs the
        ; two results so both the register form and the folded-load form are kept.
        ; The znver1 expectations list the load form first; the other CPUs list
        ; the register form first.
    703 ; GENERIC-LABEL: test_pabsd:
    704 ; GENERIC:       # %bb.0:
    705 ; GENERIC-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
    706 ; GENERIC-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
    707 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    708 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    709 ;
    710 ; HASWELL-LABEL: test_pabsd:
    711 ; HASWELL:       # %bb.0:
    712 ; HASWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
    713 ; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
    714 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    715 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    716 ;
    717 ; BROADWELL-LABEL: test_pabsd:
    718 ; BROADWELL:       # %bb.0:
    719 ; BROADWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
    720 ; BROADWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [7:0.50]
    721 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    722 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    723 ;
    724 ; SKYLAKE-LABEL: test_pabsd:
    725 ; SKYLAKE:       # %bb.0:
    726 ; SKYLAKE-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
    727 ; SKYLAKE-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
    728 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    729 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    730 ;
    731 ; SKX-LABEL: test_pabsd:
    732 ; SKX:       # %bb.0:
    733 ; SKX-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
    734 ; SKX-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
    735 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    736 ; SKX-NEXT:    retq # sched: [7:1.00]
    737 ;
    738 ; ZNVER1-LABEL: test_pabsd:
    739 ; ZNVER1:       # %bb.0:
    740 ; ZNVER1-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
    741 ; ZNVER1-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.25]
    742 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    743 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    744   %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
    745   %2 = load <8 x i32>, <8 x i32> *%a1, align 32
    746   %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2)
    747   %4 = or <8 x i32> %1, %3
    748   ret <8 x i32> %4
    749 }
    750 declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
    751 
    752 define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
        ; Schedule test for the 256-bit vpabsw: takes the absolute value of the
        ; register argument and of a 32-byte-aligned load from %a1, then ORs the
        ; two results so both the register form and the folded-load form are kept.
        ; The znver1 expectations list the load form first; the other CPUs list
        ; the register form first.
    753 ; GENERIC-LABEL: test_pabsw:
    754 ; GENERIC:       # %bb.0:
    755 ; GENERIC-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
    756 ; GENERIC-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
    757 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    758 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    759 ;
    760 ; HASWELL-LABEL: test_pabsw:
    761 ; HASWELL:       # %bb.0:
    762 ; HASWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
    763 ; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
    764 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    765 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    766 ;
    767 ; BROADWELL-LABEL: test_pabsw:
    768 ; BROADWELL:       # %bb.0:
    769 ; BROADWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
    770 ; BROADWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [7:0.50]
    771 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    772 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    773 ;
    774 ; SKYLAKE-LABEL: test_pabsw:
    775 ; SKYLAKE:       # %bb.0:
    776 ; SKYLAKE-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
    777 ; SKYLAKE-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
    778 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    779 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    780 ;
    781 ; SKX-LABEL: test_pabsw:
    782 ; SKX:       # %bb.0:
    783 ; SKX-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
    784 ; SKX-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
    785 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
    786 ; SKX-NEXT:    retq # sched: [7:1.00]
    787 ;
    788 ; ZNVER1-LABEL: test_pabsw:
    789 ; ZNVER1:       # %bb.0:
    790 ; ZNVER1-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
    791 ; ZNVER1-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.25]
    792 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    793 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    794   %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
    795   %2 = load <16 x i16>, <16 x i16> *%a1, align 32
    796   %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2)
    797   %4 = or <16 x i16> %1, %3
    798   ret <16 x i16> %4
    799 }
    800 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
    801 
    802 define <16 x i16> @test_packssdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Schedule test for the 256-bit vpackssdw: the intrinsic is applied to the
        ; two register arguments, and its result (bitcast back to <8 x i32>) is
        ; packed again with a 32-byte-aligned load from %a2, giving one register
        ; form and one folded-load form in the expected output.
    803 ; GENERIC-LABEL: test_packssdw:
    804 ; GENERIC:       # %bb.0:
    805 ; GENERIC-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    806 ; GENERIC-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    807 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    808 ;
    809 ; HASWELL-LABEL: test_packssdw:
    810 ; HASWELL:       # %bb.0:
    811 ; HASWELL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    812 ; HASWELL-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    813 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    814 ;
    815 ; BROADWELL-LABEL: test_packssdw:
    816 ; BROADWELL:       # %bb.0:
    817 ; BROADWELL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    818 ; BROADWELL-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
    819 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    820 ;
    821 ; SKYLAKE-LABEL: test_packssdw:
    822 ; SKYLAKE:       # %bb.0:
    823 ; SKYLAKE-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    824 ; SKYLAKE-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    825 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    826 ;
    827 ; SKX-LABEL: test_packssdw:
    828 ; SKX:       # %bb.0:
    829 ; SKX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    830 ; SKX-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    831 ; SKX-NEXT:    retq # sched: [7:1.00]
    832 ;
    833 ; ZNVER1-LABEL: test_packssdw:
    834 ; ZNVER1:       # %bb.0:
    835 ; ZNVER1-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    836 ; ZNVER1-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    837 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    838   %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
    839   %2 = bitcast <16 x i16> %1 to <8 x i32>
    840   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
    841   %4 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %2, <8 x i32> %3)
    842   ret <16 x i16> %4
    843 }
    844 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
    845 
    846 define <32 x i8> @test_packsswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Schedule test for the 256-bit vpacksswb: the intrinsic is applied to the
        ; two register arguments, and its result (bitcast back to <16 x i16>) is
        ; packed again with a 32-byte-aligned load from %a2, giving one register
        ; form and one folded-load form in the expected output.
    847 ; GENERIC-LABEL: test_packsswb:
    848 ; GENERIC:       # %bb.0:
    849 ; GENERIC-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    850 ; GENERIC-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    851 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    852 ;
    853 ; HASWELL-LABEL: test_packsswb:
    854 ; HASWELL:       # %bb.0:
    855 ; HASWELL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    856 ; HASWELL-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    857 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    858 ;
    859 ; BROADWELL-LABEL: test_packsswb:
    860 ; BROADWELL:       # %bb.0:
    861 ; BROADWELL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    862 ; BROADWELL-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
    863 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    864 ;
    865 ; SKYLAKE-LABEL: test_packsswb:
    866 ; SKYLAKE:       # %bb.0:
    867 ; SKYLAKE-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    868 ; SKYLAKE-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    869 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    870 ;
    871 ; SKX-LABEL: test_packsswb:
    872 ; SKX:       # %bb.0:
    873 ; SKX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    874 ; SKX-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    875 ; SKX-NEXT:    retq # sched: [7:1.00]
    876 ;
    877 ; ZNVER1-LABEL: test_packsswb:
    878 ; ZNVER1:       # %bb.0:
    879 ; ZNVER1-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    880 ; ZNVER1-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    881 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    882   %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
    883   %2 = bitcast <32 x i8> %1 to <16 x i16>
    884   %3 = load <16 x i16>, <16 x i16> *%a2, align 32
    885   %4 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %2, <16 x i16> %3)
    886   ret <32 x i8> %4
    887 }
    888 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
    889 
    890 define <16 x i16> @test_packusdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Schedule test for the 256-bit vpackusdw: the intrinsic is applied to the
        ; two register arguments, and its result (bitcast back to <8 x i32>) is
        ; packed again with a 32-byte-aligned load from %a2, giving one register
        ; form and one folded-load form in the expected output.
    891 ; GENERIC-LABEL: test_packusdw:
    892 ; GENERIC:       # %bb.0:
    893 ; GENERIC-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    894 ; GENERIC-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    895 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    896 ;
    897 ; HASWELL-LABEL: test_packusdw:
    898 ; HASWELL:       # %bb.0:
    899 ; HASWELL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    900 ; HASWELL-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    901 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    902 ;
    903 ; BROADWELL-LABEL: test_packusdw:
    904 ; BROADWELL:       # %bb.0:
    905 ; BROADWELL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    906 ; BROADWELL-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
    907 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    908 ;
    909 ; SKYLAKE-LABEL: test_packusdw:
    910 ; SKYLAKE:       # %bb.0:
    911 ; SKYLAKE-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    912 ; SKYLAKE-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    913 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    914 ;
    915 ; SKX-LABEL: test_packusdw:
    916 ; SKX:       # %bb.0:
    917 ; SKX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    918 ; SKX-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    919 ; SKX-NEXT:    retq # sched: [7:1.00]
    920 ;
    921 ; ZNVER1-LABEL: test_packusdw:
    922 ; ZNVER1:       # %bb.0:
    923 ; ZNVER1-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    924 ; ZNVER1-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    925 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    926   %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
    927   %2 = bitcast <16 x i16> %1 to <8 x i32>
    928   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
    929   %4 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %2, <8 x i32> %3)
    930   ret <16 x i16> %4
    931 }
    932 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
    933 
    934 define <32 x i8> @test_packuswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Schedule test for the 256-bit vpackuswb: the intrinsic is applied to the
        ; two register arguments, and its result (bitcast back to <16 x i16>) is
        ; packed again with a 32-byte-aligned load from %a2, giving one register
        ; form and one folded-load form in the expected output.
    935 ; GENERIC-LABEL: test_packuswb:
    936 ; GENERIC:       # %bb.0:
    937 ; GENERIC-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    938 ; GENERIC-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    939 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    940 ;
    941 ; HASWELL-LABEL: test_packuswb:
    942 ; HASWELL:       # %bb.0:
    943 ; HASWELL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    944 ; HASWELL-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    945 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    946 ;
    947 ; BROADWELL-LABEL: test_packuswb:
    948 ; BROADWELL:       # %bb.0:
    949 ; BROADWELL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    950 ; BROADWELL-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
    951 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    952 ;
    953 ; SKYLAKE-LABEL: test_packuswb:
    954 ; SKYLAKE:       # %bb.0:
    955 ; SKYLAKE-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    956 ; SKYLAKE-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    957 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
    958 ;
    959 ; SKX-LABEL: test_packuswb:
    960 ; SKX:       # %bb.0:
    961 ; SKX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
    962 ; SKX-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
    963 ; SKX-NEXT:    retq # sched: [7:1.00]
    964 ;
    965 ; ZNVER1-LABEL: test_packuswb:
    966 ; ZNVER1:       # %bb.0:
    967 ; ZNVER1-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
    968 ; ZNVER1-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    969 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
    970   %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
    971   %2 = bitcast <32 x i8> %1 to <16 x i16>
    972   %3 = load <16 x i16>, <16 x i16> *%a2, align 32
    973   %4 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %2, <16 x i16> %3)
    974   ret <32 x i8> %4
    975 }
    976 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
    977 
    978 define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Schedule test for the 256-bit vpaddb, written with plain IR `add`
        ; instructions rather than an intrinsic: %a0 + %a1, then plus a
        ; 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
    979 ; GENERIC-LABEL: test_paddb:
    980 ; GENERIC:       # %bb.0:
    981 ; GENERIC-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    982 ; GENERIC-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    983 ; GENERIC-NEXT:    retq # sched: [1:1.00]
    984 ;
    985 ; HASWELL-LABEL: test_paddb:
    986 ; HASWELL:       # %bb.0:
    987 ; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    988 ; HASWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
    989 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    990 ;
    991 ; BROADWELL-LABEL: test_paddb:
    992 ; BROADWELL:       # %bb.0:
    993 ; BROADWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
    994 ; BROADWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
    995 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
    996 ;
    997 ; SKYLAKE-LABEL: test_paddb:
    998 ; SKYLAKE:       # %bb.0:
    999 ; SKYLAKE-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1000 ; SKYLAKE-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1001 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1002 ;
   1003 ; SKX-LABEL: test_paddb:
   1004 ; SKX:       # %bb.0:
   1005 ; SKX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1006 ; SKX-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1007 ; SKX-NEXT:    retq # sched: [7:1.00]
   1008 ;
   1009 ; ZNVER1-LABEL: test_paddb:
   1010 ; ZNVER1:       # %bb.0:
   1011 ; ZNVER1-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1012 ; ZNVER1-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1013 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1014   %1 = add <32 x i8> %a0, %a1
   1015   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   1016   %3 = add <32 x i8> %1, %2
   1017   ret <32 x i8> %3
   1018 }
   1019 
   1020 define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Schedule test for the 256-bit vpaddd, written with plain IR `add`
        ; instructions rather than an intrinsic: %a0 + %a1, then plus a
        ; 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1021 ; GENERIC-LABEL: test_paddd:
   1022 ; GENERIC:       # %bb.0:
   1023 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1024 ; GENERIC-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1025 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1026 ;
   1027 ; HASWELL-LABEL: test_paddd:
   1028 ; HASWELL:       # %bb.0:
   1029 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1030 ; HASWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1031 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1032 ;
   1033 ; BROADWELL-LABEL: test_paddd:
   1034 ; BROADWELL:       # %bb.0:
   1035 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1036 ; BROADWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1037 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1038 ;
   1039 ; SKYLAKE-LABEL: test_paddd:
   1040 ; SKYLAKE:       # %bb.0:
   1041 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1042 ; SKYLAKE-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1043 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1044 ;
   1045 ; SKX-LABEL: test_paddd:
   1046 ; SKX:       # %bb.0:
   1047 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1048 ; SKX-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1049 ; SKX-NEXT:    retq # sched: [7:1.00]
   1050 ;
   1051 ; ZNVER1-LABEL: test_paddd:
   1052 ; ZNVER1:       # %bb.0:
   1053 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1054 ; ZNVER1-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1055 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1056   %1 = add <8 x i32> %a0, %a1
   1057   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   1058   %3 = add <8 x i32> %1, %2
   1059   ret <8 x i32> %3
   1060 }
   1061 
   1062 define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Schedule test for the 256-bit vpaddq, written with plain IR `add`
        ; instructions rather than an intrinsic: %a0 + %a1, then plus a
        ; 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1063 ; GENERIC-LABEL: test_paddq:
   1064 ; GENERIC:       # %bb.0:
   1065 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1066 ; GENERIC-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1067 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1068 ;
   1069 ; HASWELL-LABEL: test_paddq:
   1070 ; HASWELL:       # %bb.0:
   1071 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1072 ; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1073 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1074 ;
   1075 ; BROADWELL-LABEL: test_paddq:
   1076 ; BROADWELL:       # %bb.0:
   1077 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1078 ; BROADWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1079 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1080 ;
   1081 ; SKYLAKE-LABEL: test_paddq:
   1082 ; SKYLAKE:       # %bb.0:
   1083 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1084 ; SKYLAKE-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1085 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1086 ;
   1087 ; SKX-LABEL: test_paddq:
   1088 ; SKX:       # %bb.0:
   1089 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1090 ; SKX-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1091 ; SKX-NEXT:    retq # sched: [7:1.00]
   1092 ;
   1093 ; ZNVER1-LABEL: test_paddq:
   1094 ; ZNVER1:       # %bb.0:
   1095 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1096 ; ZNVER1-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1097 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1098   %1 = add <4 x i64> %a0, %a1
   1099   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   1100   %3 = add <4 x i64> %1, %2
   1101   ret <4 x i64> %3
   1102 }
   1103 
   1104 define <32 x i8> @test_paddsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Schedule test for the 256-bit vpaddsb: the llvm.x86.avx2.padds.b
        ; intrinsic is applied to the two register arguments, then to that result
        ; and a 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1105 ; GENERIC-LABEL: test_paddsb:
   1106 ; GENERIC:       # %bb.0:
   1107 ; GENERIC-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1108 ; GENERIC-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1109 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1110 ;
   1111 ; HASWELL-LABEL: test_paddsb:
   1112 ; HASWELL:       # %bb.0:
   1113 ; HASWELL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1114 ; HASWELL-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1115 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1116 ;
   1117 ; BROADWELL-LABEL: test_paddsb:
   1118 ; BROADWELL:       # %bb.0:
   1119 ; BROADWELL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1120 ; BROADWELL-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1121 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1122 ;
   1123 ; SKYLAKE-LABEL: test_paddsb:
   1124 ; SKYLAKE:       # %bb.0:
   1125 ; SKYLAKE-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1126 ; SKYLAKE-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1127 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1128 ;
   1129 ; SKX-LABEL: test_paddsb:
   1130 ; SKX:       # %bb.0:
   1131 ; SKX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1132 ; SKX-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1133 ; SKX-NEXT:    retq # sched: [7:1.00]
   1134 ;
   1135 ; ZNVER1-LABEL: test_paddsb:
   1136 ; ZNVER1:       # %bb.0:
   1137 ; ZNVER1-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1138 ; ZNVER1-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1139 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1140   %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1)
   1141   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   1142   %3 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %1, <32 x i8> %2)
   1143   ret <32 x i8> %3
   1144 }
   1145 declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
   1146 
   1147 define <16 x i16> @test_paddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Schedule test for the 256-bit vpaddsw: the llvm.x86.avx2.padds.w
        ; intrinsic is applied to the two register arguments, then to that result
        ; and a 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1148 ; GENERIC-LABEL: test_paddsw:
   1149 ; GENERIC:       # %bb.0:
   1150 ; GENERIC-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1151 ; GENERIC-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1152 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1153 ;
   1154 ; HASWELL-LABEL: test_paddsw:
   1155 ; HASWELL:       # %bb.0:
   1156 ; HASWELL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1157 ; HASWELL-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1158 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1159 ;
   1160 ; BROADWELL-LABEL: test_paddsw:
   1161 ; BROADWELL:       # %bb.0:
   1162 ; BROADWELL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1163 ; BROADWELL-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1164 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1165 ;
   1166 ; SKYLAKE-LABEL: test_paddsw:
   1167 ; SKYLAKE:       # %bb.0:
   1168 ; SKYLAKE-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1169 ; SKYLAKE-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1170 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1171 ;
   1172 ; SKX-LABEL: test_paddsw:
   1173 ; SKX:       # %bb.0:
   1174 ; SKX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1175 ; SKX-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1176 ; SKX-NEXT:    retq # sched: [7:1.00]
   1177 ;
   1178 ; ZNVER1-LABEL: test_paddsw:
   1179 ; ZNVER1:       # %bb.0:
   1180 ; ZNVER1-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1181 ; ZNVER1-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1182 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1183   %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1)
   1184   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   1185   %3 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %1, <16 x i16> %2)
   1186   ret <16 x i16> %3
   1187 }
   1188 declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
   1189 
   1190 define <32 x i8> @test_paddusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Schedule test for the 256-bit vpaddusb: the llvm.x86.avx2.paddus.b
        ; intrinsic is applied to the two register arguments, then to that result
        ; and a 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1191 ; GENERIC-LABEL: test_paddusb:
   1192 ; GENERIC:       # %bb.0:
   1193 ; GENERIC-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1194 ; GENERIC-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1195 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1196 ;
   1197 ; HASWELL-LABEL: test_paddusb:
   1198 ; HASWELL:       # %bb.0:
   1199 ; HASWELL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1200 ; HASWELL-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1201 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1202 ;
   1203 ; BROADWELL-LABEL: test_paddusb:
   1204 ; BROADWELL:       # %bb.0:
   1205 ; BROADWELL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1206 ; BROADWELL-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1207 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1208 ;
   1209 ; SKYLAKE-LABEL: test_paddusb:
   1210 ; SKYLAKE:       # %bb.0:
   1211 ; SKYLAKE-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1212 ; SKYLAKE-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1213 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1214 ;
   1215 ; SKX-LABEL: test_paddusb:
   1216 ; SKX:       # %bb.0:
   1217 ; SKX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1218 ; SKX-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1219 ; SKX-NEXT:    retq # sched: [7:1.00]
   1220 ;
   1221 ; ZNVER1-LABEL: test_paddusb:
   1222 ; ZNVER1:       # %bb.0:
   1223 ; ZNVER1-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1224 ; ZNVER1-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1225 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1226   %1 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1)
   1227   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   1228   %3 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %1, <32 x i8> %2)
   1229   ret <32 x i8> %3
   1230 }
   1231 declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
   1232 
   1233 define <16 x i16> @test_paddusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Schedule test for the 256-bit vpaddusw: the llvm.x86.avx2.paddus.w
        ; intrinsic is applied to the two register arguments, then to that result
        ; and a 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1234 ; GENERIC-LABEL: test_paddusw:
   1235 ; GENERIC:       # %bb.0:
   1236 ; GENERIC-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1237 ; GENERIC-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1238 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1239 ;
   1240 ; HASWELL-LABEL: test_paddusw:
   1241 ; HASWELL:       # %bb.0:
   1242 ; HASWELL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1243 ; HASWELL-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1244 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1245 ;
   1246 ; BROADWELL-LABEL: test_paddusw:
   1247 ; BROADWELL:       # %bb.0:
   1248 ; BROADWELL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1249 ; BROADWELL-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1250 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1251 ;
   1252 ; SKYLAKE-LABEL: test_paddusw:
   1253 ; SKYLAKE:       # %bb.0:
   1254 ; SKYLAKE-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1255 ; SKYLAKE-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1256 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1257 ;
   1258 ; SKX-LABEL: test_paddusw:
   1259 ; SKX:       # %bb.0:
   1260 ; SKX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1261 ; SKX-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1262 ; SKX-NEXT:    retq # sched: [7:1.00]
   1263 ;
   1264 ; ZNVER1-LABEL: test_paddusw:
   1265 ; ZNVER1:       # %bb.0:
   1266 ; ZNVER1-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1267 ; ZNVER1-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1268 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1269   %1 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1)
   1270   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   1271   %3 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %1, <16 x i16> %2)
   1272   ret <16 x i16> %3
   1273 }
   1274 declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
   1275 
   1276 define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Schedule test for the 256-bit vpaddw, written with plain IR `add`
        ; instructions rather than an intrinsic: %a0 + %a1, then plus a
        ; 32-byte-aligned load from %a2, giving one register form and one
        ; folded-load form in the expected output.
   1277 ; GENERIC-LABEL: test_paddw:
   1278 ; GENERIC:       # %bb.0:
   1279 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1280 ; GENERIC-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1281 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1282 ;
   1283 ; HASWELL-LABEL: test_paddw:
   1284 ; HASWELL:       # %bb.0:
   1285 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1286 ; HASWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1287 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1288 ;
   1289 ; BROADWELL-LABEL: test_paddw:
   1290 ; BROADWELL:       # %bb.0:
   1291 ; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1292 ; BROADWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1293 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1294 ;
   1295 ; SKYLAKE-LABEL: test_paddw:
   1296 ; SKYLAKE:       # %bb.0:
   1297 ; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1298 ; SKYLAKE-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1299 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1300 ;
   1301 ; SKX-LABEL: test_paddw:
   1302 ; SKX:       # %bb.0:
   1303 ; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1304 ; SKX-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1305 ; SKX-NEXT:    retq # sched: [7:1.00]
   1306 ;
   1307 ; ZNVER1-LABEL: test_paddw:
   1308 ; ZNVER1:       # %bb.0:
   1309 ; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1310 ; ZNVER1-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1311 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1312   %1 = add <16 x i16> %a0, %a1
   1313   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   1314   %3 = add <16 x i16> %1, %2
   1315   ret <16 x i16> %3
   1316 }
   1317 
   ; Scheduling test for the 256-bit vpalignr instruction.
   ; Two byte shuffles (shift by one byte within each 128-bit half, pulling the
   ; last byte of each half from the other source) are expected to select
   ; vpalignr; their results are combined with a byte add (vpaddb).
   ; NOTE(review): the value loaded into %2 from %a2 is never used -- the second
   ; shufflevector takes %a0 and %1 as its operands -- so the expected output
   ; contains only register-register vpalignr and the memory-operand form is not
   ; exercised here. Using %2 as a shuffle operand would cover it, but the
   ; assertions would then need to be regenerated with update_llc_test_checks.py.
   1318 define <32 x i8> @test_palignr(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   1319 ; GENERIC-LABEL: test_palignr:
   1320 ; GENERIC:       # %bb.0:
   1321 ; GENERIC-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
   1322 ; GENERIC-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
   1323 ; GENERIC-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   1324 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1325 ;
   1326 ; HASWELL-LABEL: test_palignr:
   1327 ; HASWELL:       # %bb.0:
   1328 ; HASWELL-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
   1329 ; HASWELL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
   1330 ; HASWELL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   1331 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1332 ;
   1333 ; BROADWELL-LABEL: test_palignr:
   1334 ; BROADWELL:       # %bb.0:
   1335 ; BROADWELL-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
   1336 ; BROADWELL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
   1337 ; BROADWELL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   1338 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1339 ;
   1340 ; SKYLAKE-LABEL: test_palignr:
   1341 ; SKYLAKE:       # %bb.0:
   1342 ; SKYLAKE-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
   1343 ; SKYLAKE-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
   1344 ; SKYLAKE-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   1345 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1346 ;
   1347 ; SKX-LABEL: test_palignr:
   1348 ; SKX:       # %bb.0:
   1349 ; SKX-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
   1350 ; SKX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
   1351 ; SKX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   1352 ; SKX-NEXT:    retq # sched: [7:1.00]
   1353 ;
   1354 ; ZNVER1-LABEL: test_palignr:
   1355 ; ZNVER1:       # %bb.0:
   1356 ; ZNVER1-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:0.25]
   1357 ; ZNVER1-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:0.25]
   1358 ; ZNVER1-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
   1359 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1360   %1 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
   1361   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   1362   %3 = shufflevector <32 x i8> %a0, <32 x i8> %1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
   1363   %4 = add <32 x i8> %1, %3
   1364   ret <32 x i8> %4
   1365 }
   1366 
   ; Scheduling test for the 256-bit vpand instruction.
   ; The body ANDs %a0 with %a1 (register form), then ANDs the result with a
   ; <4 x i64> value loaded from %a2 (expected to fold into the memory-operand
   ; form), and finally adds %a1 to the result (vpaddq).
   ; NOTE(review): the trailing add presumably exists to keep the logic ops in
   ; the integer execution domain so that vpand, not a floating-point AND, is
   ; selected -- TODO confirm.
   1367 define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
   1368 ; GENERIC-LABEL: test_pand:
   1369 ; GENERIC:       # %bb.0:
   1370 ; GENERIC-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1371 ; GENERIC-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1372 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1373 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1374 ;
   1375 ; HASWELL-LABEL: test_pand:
   1376 ; HASWELL:       # %bb.0:
   1377 ; HASWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1378 ; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1379 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1380 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1381 ;
   1382 ; BROADWELL-LABEL: test_pand:
   1383 ; BROADWELL:       # %bb.0:
   1384 ; BROADWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1385 ; BROADWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1386 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1387 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1388 ;
   1389 ; SKYLAKE-LABEL: test_pand:
   1390 ; SKYLAKE:       # %bb.0:
   1391 ; SKYLAKE-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1392 ; SKYLAKE-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1393 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1394 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1395 ;
   1396 ; SKX-LABEL: test_pand:
   1397 ; SKX:       # %bb.0:
   1398 ; SKX-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1399 ; SKX-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1400 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1401 ; SKX-NEXT:    retq # sched: [7:1.00]
   1402 ;
   1403 ; ZNVER1-LABEL: test_pand:
   1404 ; ZNVER1:       # %bb.0:
   1405 ; ZNVER1-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1406 ; ZNVER1-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1407 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1408 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1409   %1 = and <4 x i64> %a0, %a1
   1410   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   1411   %3 = and <4 x i64> %1, %2
   1412   %4 = add <4 x i64> %3, %a1
   1413   ret <4 x i64> %4
   1414 }
   1415 
   ; Scheduling test for the 256-bit vpandn instruction.
   ; Each "xor with all-ones, then and" pair in the body is expected to be
   ; selected as a single vpandn: first with two register operands, then with the
   ; <4 x i64> value loaded from %a2 folded in as a memory operand. The two
   ; and-not results are summed (vpaddq).
   ; NOTE(review): the final add presumably keeps the operation in the integer
   ; domain so the integer vpandn is chosen -- TODO confirm.
   1416 define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
   1417 ; GENERIC-LABEL: test_pandn:
   1418 ; GENERIC:       # %bb.0:
   1419 ; GENERIC-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1420 ; GENERIC-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
   1421 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1422 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1423 ;
   1424 ; HASWELL-LABEL: test_pandn:
   1425 ; HASWELL:       # %bb.0:
   1426 ; HASWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1427 ; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
   1428 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1429 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1430 ;
   1431 ; BROADWELL-LABEL: test_pandn:
   1432 ; BROADWELL:       # %bb.0:
   1433 ; BROADWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1434 ; BROADWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [7:0.50]
   1435 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1436 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1437 ;
   1438 ; SKYLAKE-LABEL: test_pandn:
   1439 ; SKYLAKE:       # %bb.0:
   1440 ; SKYLAKE-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1441 ; SKYLAKE-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
   1442 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1443 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1444 ;
   1445 ; SKX-LABEL: test_pandn:
   1446 ; SKX:       # %bb.0:
   1447 ; SKX-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1448 ; SKX-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
   1449 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1450 ; SKX-NEXT:    retq # sched: [7:1.00]
   1451 ;
   1452 ; ZNVER1-LABEL: test_pandn:
   1453 ; ZNVER1:       # %bb.0:
   1454 ; ZNVER1-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1455 ; ZNVER1-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
   1456 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1457 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1458   %1 = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
   1459   %2 = and <4 x i64> %a1, %1
   1460   %3 = load <4 x i64>, <4 x i64> *%a2, align 32
   1461   %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
   1462   %5 = and <4 x i64> %3, %4
   1463   %6 = add <4 x i64> %2, %5
   1464   ret <4 x i64> %6
   1465 }
   1466 
   ; Scheduling test for the 256-bit vpavgb instruction.
   ; The average is written in generic IR rather than via an intrinsic: both byte
   ; vectors are zero-extended to i16, summed, incremented by 1, shifted right by
   ; 1 and truncated back to i8. The assertions expect that sequence to be
   ; matched to a single vpavgb. The pattern appears twice: once on %a0/%a1
   ; (register form) and once on the first result and a <32 x i8> value loaded
   ; from %a2 (memory-operand form).
   1467 define <32 x i8> @test_pavgb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   1468 ; GENERIC-LABEL: test_pavgb:
   1469 ; GENERIC:       # %bb.0:
   1470 ; GENERIC-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1471 ; GENERIC-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1472 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1473 ;
   1474 ; HASWELL-LABEL: test_pavgb:
   1475 ; HASWELL:       # %bb.0:
   1476 ; HASWELL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1477 ; HASWELL-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1478 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1479 ;
   1480 ; BROADWELL-LABEL: test_pavgb:
   1481 ; BROADWELL:       # %bb.0:
   1482 ; BROADWELL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1483 ; BROADWELL-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1484 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1485 ;
   1486 ; SKYLAKE-LABEL: test_pavgb:
   1487 ; SKYLAKE:       # %bb.0:
   1488 ; SKYLAKE-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1489 ; SKYLAKE-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1490 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1491 ;
   1492 ; SKX-LABEL: test_pavgb:
   1493 ; SKX:       # %bb.0:
   1494 ; SKX-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1495 ; SKX-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1496 ; SKX-NEXT:    retq # sched: [7:1.00]
   1497 ;
   1498 ; ZNVER1-LABEL: test_pavgb:
   1499 ; ZNVER1:       # %bb.0:
   1500 ; ZNVER1-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1501 ; ZNVER1-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1502 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1503   %1 = zext <32 x i8> %a0 to <32 x i16>
   1504   %2 = zext <32 x i8> %a1 to <32 x i16>
   1505   %3 = add <32 x i16> %1, %2
   1506   %4 = add <32 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1507   %5 = lshr <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1508   %6 = trunc <32 x i16> %5 to <32 x i8>
   1509   %7 = load <32 x i8>, <32 x i8> *%a2, align 32
   1510   %8 = zext <32 x i8> %6 to <32 x i16>
   1511   %9 = zext <32 x i8> %7 to <32 x i16>
   1512   %10 = add <32 x i16> %8, %9
   1513   %11 = add <32 x i16> %10, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1514   %12 = lshr <32 x i16> %11, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1515   %13 = trunc <32 x i16> %12 to <32 x i8>
   1516   ret <32 x i8> %13
   1517 }
   1518 
   ; Scheduling test for the 256-bit vpavgw instruction.
   ; Same structure as the byte variant but on 16-bit elements: operands are
   ; zero-extended to i32, summed, incremented by 1, shifted right by 1 and
   ; truncated back to i16, which the assertions expect to be matched to vpavgw.
   ; The first occurrence uses %a0/%a1 (register form); the second uses the first
   ; result and a <16 x i16> value loaded from %a2 (memory-operand form).
   1519 define <16 x i16> @test_pavgw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   1520 ; GENERIC-LABEL: test_pavgw:
   1521 ; GENERIC:       # %bb.0:
   1522 ; GENERIC-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1523 ; GENERIC-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1524 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1525 ;
   1526 ; HASWELL-LABEL: test_pavgw:
   1527 ; HASWELL:       # %bb.0:
   1528 ; HASWELL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1529 ; HASWELL-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1530 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1531 ;
   1532 ; BROADWELL-LABEL: test_pavgw:
   1533 ; BROADWELL:       # %bb.0:
   1534 ; BROADWELL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1535 ; BROADWELL-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   1536 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1537 ;
   1538 ; SKYLAKE-LABEL: test_pavgw:
   1539 ; SKYLAKE:       # %bb.0:
   1540 ; SKYLAKE-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1541 ; SKYLAKE-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1542 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1543 ;
   1544 ; SKX-LABEL: test_pavgw:
   1545 ; SKX:       # %bb.0:
   1546 ; SKX-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1547 ; SKX-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1548 ; SKX-NEXT:    retq # sched: [7:1.00]
   1549 ;
   1550 ; ZNVER1-LABEL: test_pavgw:
   1551 ; ZNVER1:       # %bb.0:
   1552 ; ZNVER1-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1553 ; ZNVER1-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   1554 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1555   %1 = zext <16 x i16> %a0 to <16 x i32>
   1556   %2 = zext <16 x i16> %a1 to <16 x i32>
   1557   %3 = add <16 x i32> %1, %2
   1558   %4 = add <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1559   %5 = lshr <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1560   %6 = trunc <16 x i32> %5 to <16 x i16>
   1561   %7 = load <16 x i16>, <16 x i16> *%a2, align 32
   1562   %8 = zext <16 x i16> %6 to <16 x i32>
   1563   %9 = zext <16 x i16> %7 to <16 x i32>
   1564   %10 = add <16 x i32> %8, %9
   1565   %11 = add <16 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1566   %12 = lshr <16 x i32> %11, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1567   %13 = trunc <16 x i32> %12 to <16 x i16>
   1568   ret <16 x i16> %13
   1569 }
   1570 
   ; Scheduling test for the 128-bit (xmm) vpblendd instruction.
   ; The first shuffle keeps every lane in place while picking lanes 0-2 from %a1
   ; and lane 3 from %a0, which the assertions expect as a register-register
   ; vpblendd. The second shuffle takes lanes 0 and 2 from the <4 x i32> value
   ; loaded from %a2 and lanes 1 and 3 from %a1, expected as the memory-operand
   ; form. The two blends are summed with vpaddd.
   1571 define <4 x i32> @test_pblendd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
   1572 ; GENERIC-LABEL: test_pblendd:
   1573 ; GENERIC:       # %bb.0:
   1574 ; GENERIC-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
   1575 ; GENERIC-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
   1576 ; GENERIC-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1577 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1578 ;
   1579 ; HASWELL-LABEL: test_pblendd:
   1580 ; HASWELL:       # %bb.0:
   1581 ; HASWELL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
   1582 ; HASWELL-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
   1583 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1584 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1585 ;
   1586 ; BROADWELL-LABEL: test_pblendd:
   1587 ; BROADWELL:       # %bb.0:
   1588 ; BROADWELL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
   1589 ; BROADWELL-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [6:0.50]
   1590 ; BROADWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1591 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1592 ;
   1593 ; SKYLAKE-LABEL: test_pblendd:
   1594 ; SKYLAKE:       # %bb.0:
   1595 ; SKYLAKE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
   1596 ; SKYLAKE-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
   1597 ; SKYLAKE-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1598 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1599 ;
   1600 ; SKX-LABEL: test_pblendd:
   1601 ; SKX:       # %bb.0:
   1602 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
   1603 ; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
   1604 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1605 ; SKX-NEXT:    retq # sched: [7:1.00]
   1606 ;
   1607 ; ZNVER1-LABEL: test_pblendd:
   1608 ; ZNVER1:       # %bb.0:
   1609 ; ZNVER1-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
   1610 ; ZNVER1-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [8:1.00]
   1611 ; ZNVER1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
   1612 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1613   %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
   1614   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   1615   %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   1616   %4 = add <4 x i32> %1, %3
   1617   ret <4 x i32> %4
   1618 }
   1619 
   ; Scheduling test for the 256-bit (ymm) vpblendd instruction.
   ; The first shuffle picks lanes 0-2 and 7 from %a1 and lanes 3-6 from %a0
   ; without moving any lane (register-register blend). The second shuffle takes
   ; lanes 1 and 2 from the <8 x i32> value loaded from %a2 and the remaining
   ; lanes from %a1, expected as the memory-operand form. The two blends are
   ; summed with vpaddd.
   1620 define <8 x i32> @test_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
   1621 ; GENERIC-LABEL: test_pblendd_ymm:
   1622 ; GENERIC:       # %bb.0:
   1623 ; GENERIC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
   1624 ; GENERIC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
   1625 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1626 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1627 ;
   1628 ; HASWELL-LABEL: test_pblendd_ymm:
   1629 ; HASWELL:       # %bb.0:
   1630 ; HASWELL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
   1631 ; HASWELL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
   1632 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1633 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1634 ;
   1635 ; BROADWELL-LABEL: test_pblendd_ymm:
   1636 ; BROADWELL:       # %bb.0:
   1637 ; BROADWELL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
   1638 ; BROADWELL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [7:0.50]
   1639 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1640 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1641 ;
   1642 ; SKYLAKE-LABEL: test_pblendd_ymm:
   1643 ; SKYLAKE:       # %bb.0:
   1644 ; SKYLAKE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
   1645 ; SKYLAKE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
   1646 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1647 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1648 ;
   1649 ; SKX-LABEL: test_pblendd_ymm:
   1650 ; SKX:       # %bb.0:
   1651 ; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
   1652 ; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
   1653 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1654 ; SKX-NEXT:    retq # sched: [7:1.00]
   1655 ;
   1656 ; ZNVER1-LABEL: test_pblendd_ymm:
   1657 ; ZNVER1:       # %bb.0:
   1658 ; ZNVER1-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
   1659 ; ZNVER1-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [9:1.50]
   1660 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1661 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1662   %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
   1663   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   1664   %3 = shufflevector <8 x i32> %a1, <8 x i32> %2, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
   1665   %4 = add <8 x i32> %1, %3
   1666   ret <8 x i32> %4
   1667 }
   1668 
   ; Scheduling test for the 256-bit vpblendvb instruction, driven through the
   ; llvm.x86.avx2.pblendvb intrinsic.
   ; The first call blends %a0 and %a1 with %a2 as the third (selector) operand,
   ; all in registers. The second call blends that result with a <32 x i8> value
   ; loaded from %a3, using %a4 as the selector; the assertions expect the load
   ; to be folded into the memory-operand form.
   1669 define <32 x i8> @test_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2, <32 x i8> *%a3, <32 x i8> %a4) {
   1670 ; GENERIC-LABEL: test_pblendvb:
   1671 ; GENERIC:       # %bb.0:
   1672 ; GENERIC-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
   1673 ; GENERIC-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   1674 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1675 ;
   1676 ; HASWELL-LABEL: test_pblendvb:
   1677 ; HASWELL:       # %bb.0:
   1678 ; HASWELL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
   1679 ; HASWELL-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   1680 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1681 ;
   1682 ; BROADWELL-LABEL: test_pblendvb:
   1683 ; BROADWELL:       # %bb.0:
   1684 ; BROADWELL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
   1685 ; BROADWELL-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
   1686 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1687 ;
   1688 ; SKYLAKE-LABEL: test_pblendvb:
   1689 ; SKYLAKE:       # %bb.0:
   1690 ; SKYLAKE-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
   1691 ; SKYLAKE-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
   1692 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1693 ;
   1694 ; SKX-LABEL: test_pblendvb:
   1695 ; SKX:       # %bb.0:
   1696 ; SKX-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
   1697 ; SKX-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
   1698 ; SKX-NEXT:    retq # sched: [7:1.00]
   1699 ;
   1700 ; ZNVER1-LABEL: test_pblendvb:
   1701 ; ZNVER1:       # %bb.0:
   1702 ; ZNVER1-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   1703 ; ZNVER1-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   1704 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1705   %1 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2)
   1706   %2 = load <32 x i8>, <32 x i8> *%a3, align 32
   1707   %3 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %1, <32 x i8> %2, <32 x i8> %a4)
   1708   ret <32 x i8> %3
   1709 }
   ; Declaration of the AVX2 intrinsic that test_pblendvb calls; it takes three <32 x i8> operands.
   1710 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
   1711 
   ; Scheduling test for the 256-bit vpblendw instruction.
   ; The first shuffle takes lanes 2-4 and 10-12 from %a1 and all other lanes
   ; from %a0, leaving every lane in place (register-register blend). The second
   ; shuffle alternates lanes between the <16 x i16> value loaded from %a2 (even
   ; lanes) and %a1 (odd lanes), expected as the memory-operand form. The two
   ; blends are summed with vpaddw.
   1712 define <16 x i16> @test_pblendw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   1713 ; GENERIC-LABEL: test_pblendw:
   1714 ; GENERIC:       # %bb.0:
   1715 ; GENERIC-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:0.50]
   1716 ; GENERIC-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:0.50]
   1717 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1718 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1719 ;
   1720 ; HASWELL-LABEL: test_pblendw:
   1721 ; HASWELL:       # %bb.0:
   1722 ; HASWELL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
   1723 ; HASWELL-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
   1724 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1725 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1726 ;
   1727 ; BROADWELL-LABEL: test_pblendw:
   1728 ; BROADWELL:       # %bb.0:
   1729 ; BROADWELL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
   1730 ; BROADWELL-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [7:1.00]
   1731 ; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1732 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1733 ;
   1734 ; SKYLAKE-LABEL: test_pblendw:
   1735 ; SKYLAKE:       # %bb.0:
   1736 ; SKYLAKE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
   1737 ; SKYLAKE-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
   1738 ; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1739 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1740 ;
   1741 ; SKX-LABEL: test_pblendw:
   1742 ; SKX:       # %bb.0:
   1743 ; SKX-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
   1744 ; SKX-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
   1745 ; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1746 ; SKX-NEXT:    retq # sched: [7:1.00]
   1747 ;
   1748 ; ZNVER1-LABEL: test_pblendw:
   1749 ; ZNVER1:       # %bb.0:
   1750 ; ZNVER1-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [2:0.33]
   1751 ; ZNVER1-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [9:0.50]
   1752 ; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1753 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1754   %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 13, i32 14, i32 15>
   1755   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   1756   %3 = shufflevector <16 x i16> %a1, <16 x i16> %2, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
   1757   %4 = add <16 x i16> %1, %3
   1758   ret <16 x i16> %4
   1759 }
   1760 
   ; Scheduling test for the 128-bit (xmm) vpbroadcastb instruction.
   ; A shufflevector with an all-zero mask splats element 0 of %a0 (register
   ; form); the same splat applied to a <16 x i8> value loaded from %a1 is
   ; expected to become a broadcast directly from memory. The two splats are
   ; summed with vpaddb.
   ; The expected instruction order is not the same for every CPU model: the
   ; broadwell and znver1 sections list the memory broadcast before the register
   ; broadcast, the others list it after.
   1761 define <16 x i8> @test_pbroadcastb(<16 x i8> %a0, <16 x i8> *%a1) {
   1762 ; GENERIC-LABEL: test_pbroadcastb:
   1763 ; GENERIC:       # %bb.0:
   1764 ; GENERIC-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [1:0.50]
   1765 ; GENERIC-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:0.50]
   1766 ; GENERIC-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1767 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1768 ;
   1769 ; HASWELL-LABEL: test_pbroadcastb:
   1770 ; HASWELL:       # %bb.0:
   1771 ; HASWELL-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
   1772 ; HASWELL-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
   1773 ; HASWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1774 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1775 ;
   1776 ; BROADWELL-LABEL: test_pbroadcastb:
   1777 ; BROADWELL:       # %bb.0:
   1778 ; BROADWELL-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
   1779 ; BROADWELL-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
   1780 ; BROADWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1781 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1782 ;
   1783 ; SKYLAKE-LABEL: test_pbroadcastb:
   1784 ; SKYLAKE:       # %bb.0:
   1785 ; SKYLAKE-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
   1786 ; SKYLAKE-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
   1787 ; SKYLAKE-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1788 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1789 ;
   1790 ; SKX-LABEL: test_pbroadcastb:
   1791 ; SKX:       # %bb.0:
   1792 ; SKX-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
   1793 ; SKX-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
   1794 ; SKX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1795 ; SKX-NEXT:    retq # sched: [7:1.00]
   1796 ;
   1797 ; ZNVER1-LABEL: test_pbroadcastb:
   1798 ; ZNVER1:       # %bb.0:
   1799 ; ZNVER1-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [8:1.00]
   1800 ; ZNVER1-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [1:0.25]
   1801 ; ZNVER1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
   1802 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1803   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
   1804   %2 = load <16 x i8>, <16 x i8> *%a1, align 16
   1805   %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
   1806   %4 = add <16 x i8> %1, %3
   1807   ret <16 x i8> %4
   1808 }
   1809 
   ; Scheduling test for the 256-bit (ymm destination) vpbroadcastb instruction.
   ; A shufflevector with an all-zero mask splats element 0 of %a0 across all 32
   ; bytes (register form, reading the low xmm part of the source); the same
   ; splat applied to a <32 x i8> value loaded from %a1 is expected to become a
   ; broadcast directly from memory. The two splats are summed with vpaddb.
   ; As in the xmm variant, the broadwell and znver1 sections expect the memory
   ; broadcast to be emitted first; the other CPU models expect it second.
   1810 define <32 x i8> @test_pbroadcastb_ymm(<32 x i8> %a0, <32 x i8> *%a1) {
   1811 ; GENERIC-LABEL: test_pbroadcastb_ymm:
   1812 ; GENERIC:       # %bb.0:
   1813 ; GENERIC-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [1:1.00]
   1814 ; GENERIC-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [7:0.50]
   1815 ; GENERIC-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1816 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1817 ;
   1818 ; HASWELL-LABEL: test_pbroadcastb_ymm:
   1819 ; HASWELL:       # %bb.0:
   1820 ; HASWELL-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
   1821 ; HASWELL-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
   1822 ; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1823 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1824 ;
   1825 ; BROADWELL-LABEL: test_pbroadcastb_ymm:
   1826 ; BROADWELL:       # %bb.0:
   1827 ; BROADWELL-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
   1828 ; BROADWELL-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
   1829 ; BROADWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1830 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1831 ;
   1832 ; SKYLAKE-LABEL: test_pbroadcastb_ymm:
   1833 ; SKYLAKE:       # %bb.0:
   1834 ; SKYLAKE-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
   1835 ; SKYLAKE-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
   1836 ; SKYLAKE-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1837 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1838 ;
   1839 ; SKX-LABEL: test_pbroadcastb_ymm:
   1840 ; SKX:       # %bb.0:
   1841 ; SKX-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
   1842 ; SKX-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
   1843 ; SKX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1844 ; SKX-NEXT:    retq # sched: [7:1.00]
   1845 ;
   1846 ; ZNVER1-LABEL: test_pbroadcastb_ymm:
   1847 ; ZNVER1:       # %bb.0:
   1848 ; ZNVER1-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:2.00]
   1849 ; ZNVER1-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [2:0.25]
   1850 ; ZNVER1-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1851 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1852   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> zeroinitializer
   1853   %2 = load <32 x i8>, <32 x i8> *%a1, align 32
   1854   %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> zeroinitializer
   1855   %4 = add <32 x i8> %1, %3
   1856   ret <32 x i8> %4
   1857 }
   1858 
   1859 define <4 x i32> @test_pbroadcastd(<4 x i32> %a0, <4 x i32> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <4 x i32> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastd (xmm destination) followed by vpaddd;
        ; only the znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   1860 ; GENERIC-LABEL: test_pbroadcastd:
   1861 ; GENERIC:       # %bb.0:
   1862 ; GENERIC-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:0.50]
   1863 ; GENERIC-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [7:0.50]
   1864 ; GENERIC-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1865 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1866 ;
   1867 ; HASWELL-LABEL: test_pbroadcastd:
   1868 ; HASWELL:       # %bb.0:
   1869 ; HASWELL-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
   1870 ; HASWELL-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
   1871 ; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1872 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1873 ;
   1874 ; BROADWELL-LABEL: test_pbroadcastd:
   1875 ; BROADWELL:       # %bb.0:
   1876 ; BROADWELL-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
   1877 ; BROADWELL-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [5:0.50]
   1878 ; BROADWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1879 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1880 ;
   1881 ; SKYLAKE-LABEL: test_pbroadcastd:
   1882 ; SKYLAKE:       # %bb.0:
   1883 ; SKYLAKE-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
   1884 ; SKYLAKE-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
   1885 ; SKYLAKE-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1886 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1887 ;
   1888 ; SKX-LABEL: test_pbroadcastd:
   1889 ; SKX:       # %bb.0:
   1890 ; SKX-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
   1891 ; SKX-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
   1892 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1893 ; SKX-NEXT:    retq # sched: [7:1.00]
   1894 ;
   1895 ; ZNVER1-LABEL: test_pbroadcastd:
   1896 ; ZNVER1:       # %bb.0:
   1897 ; ZNVER1-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [8:0.50]
   1898 ; ZNVER1-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:0.25]
   1899 ; ZNVER1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
   1900 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1901   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
   1902   %2 = load <4 x i32>, <4 x i32> *%a1, align 16
   1903   %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
   1904   %4 = add <4 x i32> %1, %3
   1905   ret <4 x i32> %4
   1906 }
   1907 
   1908 define <8 x i32> @test_pbroadcastd_ymm(<8 x i32> %a0, <8 x i32> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <8 x i32> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastd (ymm destination) followed by vpaddd;
        ; only the znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   1909 ; GENERIC-LABEL: test_pbroadcastd_ymm:
   1910 ; GENERIC:       # %bb.0:
   1911 ; GENERIC-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [1:1.00]
   1912 ; GENERIC-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
   1913 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1914 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1915 ;
   1916 ; HASWELL-LABEL: test_pbroadcastd_ymm:
   1917 ; HASWELL:       # %bb.0:
   1918 ; HASWELL-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
   1919 ; HASWELL-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
   1920 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1921 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1922 ;
   1923 ; BROADWELL-LABEL: test_pbroadcastd_ymm:
   1924 ; BROADWELL:       # %bb.0:
   1925 ; BROADWELL-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
   1926 ; BROADWELL-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [6:0.50]
   1927 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   1928 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1929 ;
   1930 ; SKYLAKE-LABEL: test_pbroadcastd_ymm:
   1931 ; SKYLAKE:       # %bb.0:
   1932 ; SKYLAKE-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
   1933 ; SKYLAKE-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
   1934 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1935 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1936 ;
   1937 ; SKX-LABEL: test_pbroadcastd_ymm:
   1938 ; SKX:       # %bb.0:
   1939 ; SKX-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
   1940 ; SKX-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
   1941 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   1942 ; SKX-NEXT:    retq # sched: [7:1.00]
   1943 ;
   1944 ; ZNVER1-LABEL: test_pbroadcastd_ymm:
   1945 ; ZNVER1:       # %bb.0:
   1946 ; ZNVER1-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [8:0.50]
   1947 ; ZNVER1-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [2:0.25]
   1948 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   1949 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1950   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
   1951   %2 = load <8 x i32>, <8 x i32> *%a1, align 32
   1952   %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
   1953   %4 = add <8 x i32> %1, %3
   1954   ret <8 x i32> %4
   1955 }
   1956 
   1957 define <2 x i64> @test_pbroadcastq(<2 x i64> %a0, <2 x i64> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <2 x i64> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastq (xmm destination) followed by vpaddq;
        ; only the znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   1958 ; GENERIC-LABEL: test_pbroadcastq:
   1959 ; GENERIC:       # %bb.0:
   1960 ; GENERIC-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:0.50]
   1961 ; GENERIC-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [7:0.50]
   1962 ; GENERIC-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1963 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   1964 ;
   1965 ; HASWELL-LABEL: test_pbroadcastq:
   1966 ; HASWELL:       # %bb.0:
   1967 ; HASWELL-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
   1968 ; HASWELL-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
   1969 ; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1970 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1971 ;
   1972 ; BROADWELL-LABEL: test_pbroadcastq:
   1973 ; BROADWELL:       # %bb.0:
   1974 ; BROADWELL-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
   1975 ; BROADWELL-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [5:0.50]
   1976 ; BROADWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   1977 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   1978 ;
   1979 ; SKYLAKE-LABEL: test_pbroadcastq:
   1980 ; SKYLAKE:       # %bb.0:
   1981 ; SKYLAKE-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
   1982 ; SKYLAKE-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
   1983 ; SKYLAKE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1984 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   1985 ;
   1986 ; SKX-LABEL: test_pbroadcastq:
   1987 ; SKX:       # %bb.0:
   1988 ; SKX-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
   1989 ; SKX-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
   1990 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   1991 ; SKX-NEXT:    retq # sched: [7:1.00]
   1992 ;
   1993 ; ZNVER1-LABEL: test_pbroadcastq:
   1994 ; ZNVER1:       # %bb.0:
   1995 ; ZNVER1-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [8:0.50]
   1996 ; ZNVER1-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:0.25]
   1997 ; ZNVER1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
   1998 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   1999   %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
   2000   %2 = load <2 x i64>, <2 x i64> *%a1, align 16
   2001   %3 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
   2002   %4 = add <2 x i64> %1, %3
   2003   ret <2 x i64> %4
   2004 }
   2005 
   2006 define <4 x i64> @test_pbroadcastq_ymm(<4 x i64> %a0, <4 x i64> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <4 x i64> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastq (ymm destination) followed by vpaddq;
        ; only the znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2007 ; GENERIC-LABEL: test_pbroadcastq_ymm:
   2008 ; GENERIC:       # %bb.0:
   2009 ; GENERIC-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [1:1.00]
   2010 ; GENERIC-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
   2011 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2012 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2013 ;
   2014 ; HASWELL-LABEL: test_pbroadcastq_ymm:
   2015 ; HASWELL:       # %bb.0:
   2016 ; HASWELL-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
   2017 ; HASWELL-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
   2018 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2019 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2020 ;
   2021 ; BROADWELL-LABEL: test_pbroadcastq_ymm:
   2022 ; BROADWELL:       # %bb.0:
   2023 ; BROADWELL-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
   2024 ; BROADWELL-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [6:0.50]
   2025 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2026 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2027 ;
   2028 ; SKYLAKE-LABEL: test_pbroadcastq_ymm:
   2029 ; SKYLAKE:       # %bb.0:
   2030 ; SKYLAKE-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
   2031 ; SKYLAKE-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
   2032 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2033 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2034 ;
   2035 ; SKX-LABEL: test_pbroadcastq_ymm:
   2036 ; SKX:       # %bb.0:
   2037 ; SKX-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
   2038 ; SKX-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
   2039 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2040 ; SKX-NEXT:    retq # sched: [7:1.00]
   2041 ;
   2042 ; ZNVER1-LABEL: test_pbroadcastq_ymm:
   2043 ; ZNVER1:       # %bb.0:
   2044 ; ZNVER1-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [8:0.50]
   2045 ; ZNVER1-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [2:0.25]
   2046 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2047 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2048   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
   2049   %2 = load <4 x i64>, <4 x i64> *%a1, align 32
   2050   %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> zeroinitializer
   2051   %4 = add <4 x i64> %1, %3
   2052   ret <4 x i64> %4
   2053 }
   2054 
   2055 define <8 x i16> @test_pbroadcastw(<8 x i16> %a0, <8 x i16> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <8 x i16> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastw (xmm destination) followed by vpaddw;
        ; the Broadwell and znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2056 ; GENERIC-LABEL: test_pbroadcastw:
   2057 ; GENERIC:       # %bb.0:
   2058 ; GENERIC-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [1:0.50]
   2059 ; GENERIC-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:0.50]
   2060 ; GENERIC-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   2061 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2062 ;
   2063 ; HASWELL-LABEL: test_pbroadcastw:
   2064 ; HASWELL:       # %bb.0:
   2065 ; HASWELL-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
   2066 ; HASWELL-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
   2067 ; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   2068 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2069 ;
   2070 ; BROADWELL-LABEL: test_pbroadcastw:
   2071 ; BROADWELL:       # %bb.0:
   2072 ; BROADWELL-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
   2073 ; BROADWELL-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
   2074 ; BROADWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   2075 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2076 ;
   2077 ; SKYLAKE-LABEL: test_pbroadcastw:
   2078 ; SKYLAKE:       # %bb.0:
   2079 ; SKYLAKE-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
   2080 ; SKYLAKE-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
   2081 ; SKYLAKE-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   2082 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2083 ;
   2084 ; SKX-LABEL: test_pbroadcastw:
   2085 ; SKX:       # %bb.0:
   2086 ; SKX-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
   2087 ; SKX-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
   2088 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
   2089 ; SKX-NEXT:    retq # sched: [7:1.00]
   2090 ;
   2091 ; ZNVER1-LABEL: test_pbroadcastw:
   2092 ; ZNVER1:       # %bb.0:
   2093 ; ZNVER1-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [8:1.00]
   2094 ; ZNVER1-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [1:0.25]
   2095 ; ZNVER1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
   2096 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2097   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
   2098   %2 = load <8 x i16>, <8 x i16> *%a1, align 16
   2099   %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
   2100   %4 = add <8 x i16> %1, %3
   2101   ret <8 x i16> %4
   2102 }
   2103 
   2104 define <16 x i16> @test_pbroadcastw_ymm(<16 x i16> %a0, <16 x i16> *%a1) {
        ; Splats element 0 of %a0 and element 0 of the <16 x i16> loaded from %a1,
        ; then adds the two splats. Each CPU model below expects one register-source
        ; and one memory-source vpbroadcastw (ymm destination) followed by vpaddw;
        ; the Broadwell and znver1 expectations list the memory-source broadcast first.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2105 ; GENERIC-LABEL: test_pbroadcastw_ymm:
   2106 ; GENERIC:       # %bb.0:
   2107 ; GENERIC-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [1:1.00]
   2108 ; GENERIC-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [7:0.50]
   2109 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2110 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2111 ;
   2112 ; HASWELL-LABEL: test_pbroadcastw_ymm:
   2113 ; HASWELL:       # %bb.0:
   2114 ; HASWELL-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
   2115 ; HASWELL-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
   2116 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2117 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2118 ;
   2119 ; BROADWELL-LABEL: test_pbroadcastw_ymm:
   2120 ; BROADWELL:       # %bb.0:
   2121 ; BROADWELL-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
   2122 ; BROADWELL-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
   2123 ; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2124 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2125 ;
   2126 ; SKYLAKE-LABEL: test_pbroadcastw_ymm:
   2127 ; SKYLAKE:       # %bb.0:
   2128 ; SKYLAKE-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
   2129 ; SKYLAKE-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
   2130 ; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2131 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2132 ;
   2133 ; SKX-LABEL: test_pbroadcastw_ymm:
   2134 ; SKX:       # %bb.0:
   2135 ; SKX-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
   2136 ; SKX-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
   2137 ; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2138 ; SKX-NEXT:    retq # sched: [7:1.00]
   2139 ;
   2140 ; ZNVER1-LABEL: test_pbroadcastw_ymm:
   2141 ; ZNVER1:       # %bb.0:
   2142 ; ZNVER1-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:2.00]
   2143 ; ZNVER1-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [2:0.25]
   2144 ; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2145 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2146   %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
   2147   %2 = load <16 x i16>, <16 x i16> *%a1, align 32
   2148   %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> zeroinitializer
   2149   %4 = add <16 x i16> %1, %3
   2150   ret <16 x i16> %4
   2151 }
   2152 
   2153 define <32 x i8> @test_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Compares %a0 with %a1 for equality, sign-extends the i1 result to a byte
        ; mask, then compares that mask for equality with the <32 x i8> loaded from
        ; %a2 and sign-extends again. Each CPU model below expects a register-source
        ; vpcmpeqb followed by a memory-source vpcmpeqb that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2154 ; GENERIC-LABEL: test_pcmpeqb:
   2155 ; GENERIC:       # %bb.0:
   2156 ; GENERIC-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2157 ; GENERIC-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2158 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2159 ;
   2160 ; HASWELL-LABEL: test_pcmpeqb:
   2161 ; HASWELL:       # %bb.0:
   2162 ; HASWELL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2163 ; HASWELL-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2164 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2165 ;
   2166 ; BROADWELL-LABEL: test_pcmpeqb:
   2167 ; BROADWELL:       # %bb.0:
   2168 ; BROADWELL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2169 ; BROADWELL-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2170 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2171 ;
   2172 ; SKYLAKE-LABEL: test_pcmpeqb:
   2173 ; SKYLAKE:       # %bb.0:
   2174 ; SKYLAKE-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2175 ; SKYLAKE-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2176 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2177 ;
   2178 ; SKX-LABEL: test_pcmpeqb:
   2179 ; SKX:       # %bb.0:
   2180 ; SKX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2181 ; SKX-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2182 ; SKX-NEXT:    retq # sched: [7:1.00]
   2183 ;
   2184 ; ZNVER1-LABEL: test_pcmpeqb:
   2185 ; ZNVER1:       # %bb.0:
   2186 ; ZNVER1-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2187 ; ZNVER1-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2188 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2189   %1 = icmp eq <32 x i8> %a0, %a1
   2190   %2 = sext <32 x i1> %1 to <32 x i8>
   2191   %3 = load <32 x i8>, <32 x i8> *%a2, align 32
   2192   %4 = icmp eq <32 x i8> %2, %3
   2193   %5 = sext <32 x i1> %4 to <32 x i8>
   2194   ret <32 x i8> %5
   2195 }
   2196 
   2197 define <8 x i32> @test_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Compares %a0 with %a1 for equality, sign-extends the i1 result to a dword
        ; mask, then compares that mask for equality with the <8 x i32> loaded from
        ; %a2 and sign-extends again. Each CPU model below expects a register-source
        ; vpcmpeqd followed by a memory-source vpcmpeqd that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2198 ; GENERIC-LABEL: test_pcmpeqd:
   2199 ; GENERIC:       # %bb.0:
   2200 ; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2201 ; GENERIC-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2202 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2203 ;
   2204 ; HASWELL-LABEL: test_pcmpeqd:
   2205 ; HASWELL:       # %bb.0:
   2206 ; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2207 ; HASWELL-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2208 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2209 ;
   2210 ; BROADWELL-LABEL: test_pcmpeqd:
   2211 ; BROADWELL:       # %bb.0:
   2212 ; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2213 ; BROADWELL-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2214 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2215 ;
   2216 ; SKYLAKE-LABEL: test_pcmpeqd:
   2217 ; SKYLAKE:       # %bb.0:
   2218 ; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2219 ; SKYLAKE-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2220 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2221 ;
   2222 ; SKX-LABEL: test_pcmpeqd:
   2223 ; SKX:       # %bb.0:
   2224 ; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2225 ; SKX-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2226 ; SKX-NEXT:    retq # sched: [7:1.00]
   2227 ;
   2228 ; ZNVER1-LABEL: test_pcmpeqd:
   2229 ; ZNVER1:       # %bb.0:
   2230 ; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2231 ; ZNVER1-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2232 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2233   %1 = icmp eq <8 x i32> %a0, %a1
   2234   %2 = sext <8 x i1> %1 to <8 x i32>
   2235   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
   2236   %4 = icmp eq <8 x i32> %2, %3
   2237   %5 = sext <8 x i1> %4 to <8 x i32>
   2238   ret <8 x i32> %5
   2239 }
   2240 
   2241 define <4 x i64> @test_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Compares %a0 with %a1 for equality, sign-extends the i1 result to a qword
        ; mask, then compares that mask for equality with the <4 x i64> loaded from
        ; %a2 and sign-extends again. Each CPU model below expects a register-source
        ; vpcmpeqq followed by a memory-source vpcmpeqq that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2242 ; GENERIC-LABEL: test_pcmpeqq:
   2243 ; GENERIC:       # %bb.0:
   2244 ; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2245 ; GENERIC-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2246 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2247 ;
   2248 ; HASWELL-LABEL: test_pcmpeqq:
   2249 ; HASWELL:       # %bb.0:
   2250 ; HASWELL-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2251 ; HASWELL-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2252 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2253 ;
   2254 ; BROADWELL-LABEL: test_pcmpeqq:
   2255 ; BROADWELL:       # %bb.0:
   2256 ; BROADWELL-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2257 ; BROADWELL-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2258 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2259 ;
   2260 ; SKYLAKE-LABEL: test_pcmpeqq:
   2261 ; SKYLAKE:       # %bb.0:
   2262 ; SKYLAKE-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2263 ; SKYLAKE-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2264 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2265 ;
   2266 ; SKX-LABEL: test_pcmpeqq:
   2267 ; SKX:       # %bb.0:
   2268 ; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2269 ; SKX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2270 ; SKX-NEXT:    retq # sched: [7:1.00]
   2271 ;
   2272 ; ZNVER1-LABEL: test_pcmpeqq:
   2273 ; ZNVER1:       # %bb.0:
   2274 ; ZNVER1-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2275 ; ZNVER1-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2276 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2277   %1 = icmp eq <4 x i64> %a0, %a1
   2278   %2 = sext <4 x i1> %1 to <4 x i64>
   2279   %3 = load <4 x i64>, <4 x i64> *%a2, align 32
   2280   %4 = icmp eq <4 x i64> %2, %3
   2281   %5 = sext <4 x i1> %4 to <4 x i64>
   2282   ret <4 x i64> %5
   2283 }
   2284 
   2285 define <16 x i16> @test_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Compares %a0 with %a1 for equality, sign-extends the i1 result to a word
        ; mask, then compares that mask for equality with the <16 x i16> loaded from
        ; %a2 and sign-extends again. Each CPU model below expects a register-source
        ; vpcmpeqw followed by a memory-source vpcmpeqw that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2286 ; GENERIC-LABEL: test_pcmpeqw:
   2287 ; GENERIC:       # %bb.0:
   2288 ; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2289 ; GENERIC-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2290 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2291 ;
   2292 ; HASWELL-LABEL: test_pcmpeqw:
   2293 ; HASWELL:       # %bb.0:
   2294 ; HASWELL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2295 ; HASWELL-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2296 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2297 ;
   2298 ; BROADWELL-LABEL: test_pcmpeqw:
   2299 ; BROADWELL:       # %bb.0:
   2300 ; BROADWELL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2301 ; BROADWELL-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2302 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2303 ;
   2304 ; SKYLAKE-LABEL: test_pcmpeqw:
   2305 ; SKYLAKE:       # %bb.0:
   2306 ; SKYLAKE-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2307 ; SKYLAKE-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2308 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2309 ;
   2310 ; SKX-LABEL: test_pcmpeqw:
   2311 ; SKX:       # %bb.0:
   2312 ; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2313 ; SKX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2314 ; SKX-NEXT:    retq # sched: [7:1.00]
   2315 ;
   2316 ; ZNVER1-LABEL: test_pcmpeqw:
   2317 ; ZNVER1:       # %bb.0:
   2318 ; ZNVER1-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2319 ; ZNVER1-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2320 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2321   %1 = icmp eq <16 x i16> %a0, %a1
   2322   %2 = sext <16 x i1> %1 to <16 x i16>
   2323   %3 = load <16 x i16>, <16 x i16> *%a2, align 32
   2324   %4 = icmp eq <16 x i16> %2, %3
   2325   %5 = sext <16 x i1> %4 to <16 x i16>
   2326   ret <16 x i16> %5
   2327 }
   2328 
   2329 define <32 x i8> @test_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Signed greater-than compare of %a0 against %a1, sign-extended to a byte
        ; mask; that mask is then compared (signed greater-than) against the
        ; <32 x i8> loaded from %a2 and sign-extended again. Each CPU model below
        ; expects a register-source vpcmpgtb followed by a memory-source vpcmpgtb
        ; that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2330 ; GENERIC-LABEL: test_pcmpgtb:
   2331 ; GENERIC:       # %bb.0:
   2332 ; GENERIC-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2333 ; GENERIC-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2334 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2335 ;
   2336 ; HASWELL-LABEL: test_pcmpgtb:
   2337 ; HASWELL:       # %bb.0:
   2338 ; HASWELL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2339 ; HASWELL-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2340 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2341 ;
   2342 ; BROADWELL-LABEL: test_pcmpgtb:
   2343 ; BROADWELL:       # %bb.0:
   2344 ; BROADWELL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2345 ; BROADWELL-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2346 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2347 ;
   2348 ; SKYLAKE-LABEL: test_pcmpgtb:
   2349 ; SKYLAKE:       # %bb.0:
   2350 ; SKYLAKE-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2351 ; SKYLAKE-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2352 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2353 ;
   2354 ; SKX-LABEL: test_pcmpgtb:
   2355 ; SKX:       # %bb.0:
   2356 ; SKX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2357 ; SKX-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2358 ; SKX-NEXT:    retq # sched: [7:1.00]
   2359 ;
   2360 ; ZNVER1-LABEL: test_pcmpgtb:
   2361 ; ZNVER1:       # %bb.0:
   2362 ; ZNVER1-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2363 ; ZNVER1-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2364 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2365   %1 = icmp sgt <32 x i8> %a0, %a1
   2366   %2 = sext <32 x i1> %1 to <32 x i8>
   2367   %3 = load <32 x i8>, <32 x i8> *%a2, align 32
   2368   %4 = icmp sgt <32 x i8> %2, %3
   2369   %5 = sext <32 x i1> %4 to <32 x i8>
   2370   ret <32 x i8> %5
   2371 }
   2372 
   2373 define <8 x i32> @test_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Signed greater-than compare of %a0 against %a1, sign-extended to a dword
        ; mask; that mask is then compared (signed greater-than) against the
        ; <8 x i32> loaded from %a2 and sign-extended again. Each CPU model below
        ; expects a register-source vpcmpgtd followed by a memory-source vpcmpgtd
        ; that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2374 ; GENERIC-LABEL: test_pcmpgtd:
   2375 ; GENERIC:       # %bb.0:
   2376 ; GENERIC-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2377 ; GENERIC-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2378 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2379 ;
   2380 ; HASWELL-LABEL: test_pcmpgtd:
   2381 ; HASWELL:       # %bb.0:
   2382 ; HASWELL-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2383 ; HASWELL-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2384 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2385 ;
   2386 ; BROADWELL-LABEL: test_pcmpgtd:
   2387 ; BROADWELL:       # %bb.0:
   2388 ; BROADWELL-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2389 ; BROADWELL-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2390 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2391 ;
   2392 ; SKYLAKE-LABEL: test_pcmpgtd:
   2393 ; SKYLAKE:       # %bb.0:
   2394 ; SKYLAKE-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2395 ; SKYLAKE-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2396 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2397 ;
   2398 ; SKX-LABEL: test_pcmpgtd:
   2399 ; SKX:       # %bb.0:
   2400 ; SKX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2401 ; SKX-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2402 ; SKX-NEXT:    retq # sched: [7:1.00]
   2403 ;
   2404 ; ZNVER1-LABEL: test_pcmpgtd:
   2405 ; ZNVER1:       # %bb.0:
   2406 ; ZNVER1-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2407 ; ZNVER1-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2408 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2409   %1 = icmp sgt <8 x i32> %a0, %a1
   2410   %2 = sext <8 x i1> %1 to <8 x i32>
   2411   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
   2412   %4 = icmp sgt <8 x i32> %2, %3
   2413   %5 = sext <8 x i1> %4 to <8 x i32>
   2414   ret <8 x i32> %5
   2415 }
   2416 
   2417 define <4 x i64> @test_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Signed greater-than compare of %a0 against %a1, sign-extended to a qword
        ; mask; that mask is then compared (signed greater-than) against the
        ; <4 x i64> loaded from %a2 and sign-extended again. Each CPU model below
        ; expects a register-source vpcmpgtq followed by a memory-source vpcmpgtq
        ; that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2418 ; GENERIC-LABEL: test_pcmpgtq:
   2419 ; GENERIC:       # %bb.0:
   2420 ; GENERIC-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2421 ; GENERIC-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2422 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2423 ;
   2424 ; HASWELL-LABEL: test_pcmpgtq:
   2425 ; HASWELL:       # %bb.0:
   2426 ; HASWELL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   2427 ; HASWELL-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   2428 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2429 ;
   2430 ; BROADWELL-LABEL: test_pcmpgtq:
   2431 ; BROADWELL:       # %bb.0:
   2432 ; BROADWELL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   2433 ; BROADWELL-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   2434 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2435 ;
   2436 ; SKYLAKE-LABEL: test_pcmpgtq:
   2437 ; SKYLAKE:       # %bb.0:
   2438 ; SKYLAKE-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2439 ; SKYLAKE-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2440 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2441 ;
   2442 ; SKX-LABEL: test_pcmpgtq:
   2443 ; SKX:       # %bb.0:
   2444 ; SKX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2445 ; SKX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2446 ; SKX-NEXT:    retq # sched: [7:1.00]
   2447 ;
   2448 ; ZNVER1-LABEL: test_pcmpgtq:
   2449 ; ZNVER1:       # %bb.0:
   2450 ; ZNVER1-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2451 ; ZNVER1-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   2452 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2453   %1 = icmp sgt <4 x i64> %a0, %a1
   2454   %2 = sext <4 x i1> %1 to <4 x i64>
   2455   %3 = load <4 x i64>, <4 x i64> *%a2, align 32
   2456   %4 = icmp sgt <4 x i64> %2, %3
   2457   %5 = sext <4 x i1> %4 to <4 x i64>
   2458   ret <4 x i64> %5
   2459 }
   2460 
   2461 define <16 x i16> @test_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Signed greater-than compare of %a0 against %a1, sign-extended to a word
        ; mask; that mask is then compared (signed greater-than) against the
        ; <16 x i16> loaded from %a2 and sign-extended again. Each CPU model below
        ; expects a register-source vpcmpgtw followed by a memory-source vpcmpgtw
        ; that consumes its result.
        ; NOTE(review): the "sched" pairs presumably come from -print-schedule as
        ; [latency:reciprocal throughput] - confirm against the scheduler models.
   2462 ; GENERIC-LABEL: test_pcmpgtw:
   2463 ; GENERIC:       # %bb.0:
   2464 ; GENERIC-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2465 ; GENERIC-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2466 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2467 ;
   2468 ; HASWELL-LABEL: test_pcmpgtw:
   2469 ; HASWELL:       # %bb.0:
   2470 ; HASWELL-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2471 ; HASWELL-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2472 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2473 ;
   2474 ; BROADWELL-LABEL: test_pcmpgtw:
   2475 ; BROADWELL:       # %bb.0:
   2476 ; BROADWELL-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2477 ; BROADWELL-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   2478 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2479 ;
   2480 ; SKYLAKE-LABEL: test_pcmpgtw:
   2481 ; SKYLAKE:       # %bb.0:
   2482 ; SKYLAKE-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2483 ; SKYLAKE-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2484 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2485 ;
   2486 ; SKX-LABEL: test_pcmpgtw:
   2487 ; SKX:       # %bb.0:
   2488 ; SKX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2489 ; SKX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2490 ; SKX-NEXT:    retq # sched: [7:1.00]
   2491 ;
   2492 ; ZNVER1-LABEL: test_pcmpgtw:
   2493 ; ZNVER1:       # %bb.0:
   2494 ; ZNVER1-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2495 ; ZNVER1-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   2496 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2497   %1 = icmp sgt <16 x i16> %a0, %a1
   2498   %2 = sext <16 x i1> %1 to <16 x i16>
   2499   %3 = load <16 x i16>, <16 x i16> *%a2, align 32
   2500   %4 = icmp sgt <16 x i16> %2, %3
   2501   %5 = sext <16 x i1> %4 to <16 x i16>
   2502   ret <16 x i16> %5
   2503 }
   2504 
   2505 define <4 x i64> @test_perm2i128(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Covers vperm2i128 with a register source and with a memory source. Both
        ; shuffles use mask <2,3,4,5>; the second one takes its other input from
        ; the load of %a2, and the two results are added to form the return value.
   2506 ; GENERIC-LABEL: test_perm2i128:
   2507 ; GENERIC:       # %bb.0:
   2508 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
   2509 ; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
   2510 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2511 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2512 ;
   2513 ; HASWELL-LABEL: test_perm2i128:
   2514 ; HASWELL:       # %bb.0:
   2515 ; HASWELL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   2516 ; HASWELL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
   2517 ; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2518 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2519 ;
   2520 ; BROADWELL-LABEL: test_perm2i128:
   2521 ; BROADWELL:       # %bb.0:
   2522 ; BROADWELL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   2523 ; BROADWELL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:1.00]
   2524 ; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2525 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2526 ;
   2527 ; SKYLAKE-LABEL: test_perm2i128:
   2528 ; SKYLAKE:       # %bb.0:
   2529 ; SKYLAKE-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   2530 ; SKYLAKE-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
   2531 ; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   2532 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2533 ;
   2534 ; SKX-LABEL: test_perm2i128:
   2535 ; SKX:       # %bb.0:
   2536 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
   2537 ; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
   2538 ; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   2539 ; SKX-NEXT:    retq # sched: [7:1.00]
   2540 ;
   2541 ; ZNVER1-LABEL: test_perm2i128:
   2542 ; ZNVER1:       # %bb.0:
   2543 ; ZNVER1-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [2:0.25]
   2544 ; ZNVER1-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:0.50]
   2545 ; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
   2546 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2547   %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2548   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   2549   %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   2550   %4 = add <4 x i64> %1, %3
   2551   ret <4 x i64> %4
   2552 }
   2553 
   2554 define <8 x i32> @test_permd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Covers vpermd with a register source and with a memory source (the load
        ; of %a2). %a0 is passed as the second intrinsic argument in both calls;
        ; the two results are added to form the return value.
   2555 ; GENERIC-LABEL: test_permd:
   2556 ; GENERIC:       # %bb.0:
   2557 ; GENERIC-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
   2558 ; GENERIC-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   2559 ; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2560 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2561 ;
   2562 ; HASWELL-LABEL: test_permd:
   2563 ; HASWELL:       # %bb.0:
   2564 ; HASWELL-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2565 ; HASWELL-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2566 ; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2567 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2568 ;
   2569 ; BROADWELL-LABEL: test_permd:
   2570 ; BROADWELL:       # %bb.0:
   2571 ; BROADWELL-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2572 ; BROADWELL-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   2573 ; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   2574 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2575 ;
   2576 ; SKYLAKE-LABEL: test_permd:
   2577 ; SKYLAKE:       # %bb.0:
   2578 ; SKYLAKE-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2579 ; SKYLAKE-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2580 ; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   2581 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2582 ;
   2583 ; SKX-LABEL: test_permd:
   2584 ; SKX:       # %bb.0:
   2585 ; SKX-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2586 ; SKX-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2587 ; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   2588 ; SKX-NEXT:    retq # sched: [7:1.00]
   2589 ;
   2590 ; ZNVER1-LABEL: test_permd:
   2591 ; ZNVER1:       # %bb.0:
   2592 ; ZNVER1-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [2:0.25]
   2593 ; ZNVER1-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
   2594 ; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
   2595 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2596   %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
   2597   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   2598   %3 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> %a0)
   2599   %4 = add <8 x i32> %1, %3
   2600   ret <8 x i32> %4
   2601 }
   2602 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
   2603 
   2604 define <4 x double> @test_permpd(<4 x double> %a0, <4 x double> *%a1) {
        ; Covers vpermpd with a register source (mask <3,2,2,3> applied to %a0) and
        ; with a memory source (mask <0,2,2,3> applied to the load of %a1); the two
        ; shuffles are combined with fadd to form the return value.
   2605 ; GENERIC-LABEL: test_permpd:
   2606 ; GENERIC:       # %bb.0:
   2607 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
   2608 ; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
   2609 ; GENERIC-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2610 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2611 ;
   2612 ; HASWELL-LABEL: test_permpd:
   2613 ; HASWELL:       # %bb.0:
   2614 ; HASWELL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2615 ; HASWELL-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2616 ; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2617 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2618 ;
   2619 ; BROADWELL-LABEL: test_permpd:
   2620 ; BROADWELL:       # %bb.0:
   2621 ; BROADWELL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2622 ; BROADWELL-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
   2623 ; BROADWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2624 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2625 ;
   2626 ; SKYLAKE-LABEL: test_permpd:
   2627 ; SKYLAKE:       # %bb.0:
   2628 ; SKYLAKE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2629 ; SKYLAKE-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2630 ; SKYLAKE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   2631 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2632 ;
   2633 ; SKX-LABEL: test_permpd:
   2634 ; SKX:       # %bb.0:
   2635 ; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2636 ; SKX-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2637 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   2638 ; SKX-NEXT:    retq # sched: [7:1.00]
   2639 ;
   2640 ; ZNVER1-LABEL: test_permpd:
   2641 ; ZNVER1:       # %bb.0:
   2642 ; ZNVER1-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [107:0.50]
   2643 ; ZNVER1-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [100:0.25]
   2644 ; ZNVER1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   2645 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2646   %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
   2647   %2 = load <4 x double>, <4 x double> *%a1, align 32
   2648   %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
   2649   %4 = fadd <4 x double> %1, %3
   2650   ret <4 x double> %4
   2651 }
   2652 
   2653 define <8 x float> @test_permps(<8 x i32> %a0, <8 x float> %a1, <8 x float> *%a2) {
        ; Covers vpermps with a register source and with a memory source (the load
        ; of %a2). The <8 x i32> value %a0 is the second intrinsic argument in both
        ; calls; the two results are combined with fadd to form the return value.
   2654 ; GENERIC-LABEL: test_permps:
   2655 ; GENERIC:       # %bb.0:
   2656 ; GENERIC-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
   2657 ; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   2658 ; GENERIC-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2659 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2660 ;
   2661 ; HASWELL-LABEL: test_permps:
   2662 ; HASWELL:       # %bb.0:
   2663 ; HASWELL-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2664 ; HASWELL-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2665 ; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2666 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2667 ;
   2668 ; BROADWELL-LABEL: test_permps:
   2669 ; BROADWELL:       # %bb.0:
   2670 ; BROADWELL-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2671 ; BROADWELL-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   2672 ; BROADWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2673 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2674 ;
   2675 ; SKYLAKE-LABEL: test_permps:
   2676 ; SKYLAKE:       # %bb.0:
   2677 ; SKYLAKE-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2678 ; SKYLAKE-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2679 ; SKYLAKE-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
   2680 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2681 ;
   2682 ; SKX-LABEL: test_permps:
   2683 ; SKX:       # %bb.0:
   2684 ; SKX-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
   2685 ; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   2686 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
   2687 ; SKX-NEXT:    retq # sched: [7:1.00]
   2688 ;
   2689 ; ZNVER1-LABEL: test_permps:
   2690 ; ZNVER1:       # %bb.0:
   2691 ; ZNVER1-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [100:0.25]
   2692 ; ZNVER1-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [107:0.50]
   2693 ; ZNVER1-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   2694 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2695   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
   2696   %2 = load <8 x float>, <8 x float> *%a2, align 32
   2697   %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> %a0)
   2698   %4 = fadd <8 x float> %1, %3
   2699   ret <8 x float> %4
   2700 }
   2701 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
   2702 
   2703 define <4 x i64> @test_permq(<4 x i64> %a0, <4 x i64> *%a1) {
        ; Covers vpermq with a register source (mask <3,2,2,3> applied to %a0) and
        ; with a memory source (mask <0,2,2,3> applied to the load of %a1); the two
        ; shuffles are added to form the return value.
   2704 ; GENERIC-LABEL: test_permq:
   2705 ; GENERIC:       # %bb.0:
   2706 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
   2707 ; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
   2708 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2709 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2710 ;
   2711 ; HASWELL-LABEL: test_permq:
   2712 ; HASWELL:       # %bb.0:
   2713 ; HASWELL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2714 ; HASWELL-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2715 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2716 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2717 ;
   2718 ; BROADWELL-LABEL: test_permq:
   2719 ; BROADWELL:       # %bb.0:
   2720 ; BROADWELL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2721 ; BROADWELL-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
   2722 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   2723 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2724 ;
   2725 ; SKYLAKE-LABEL: test_permq:
   2726 ; SKYLAKE:       # %bb.0:
   2727 ; SKYLAKE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2728 ; SKYLAKE-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2729 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2730 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2731 ;
   2732 ; SKX-LABEL: test_permq:
   2733 ; SKX:       # %bb.0:
   2734 ; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
   2735 ; SKX-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
   2736 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   2737 ; SKX-NEXT:    retq # sched: [7:1.00]
   2738 ;
   2739 ; ZNVER1-LABEL: test_permq:
   2740 ; ZNVER1:       # %bb.0:
   2741 ; ZNVER1-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:0.50]
   2742 ; ZNVER1-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [2:0.25]
   2743 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   2744 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2745   %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
   2746   %2 = load <4 x i64>, <4 x i64> *%a1, align 32
   2747   %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
   2748   %4 = add <4 x i64> %1, %3
   2749   ret <4 x i64> %4
   2750 }
   2751 
   2752 define <4 x i32> @test_pgatherdd(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3) {
        ; Covers vpgatherdd with xmm operands via llvm.x86.avx2.gather.d.d. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%xmm1,2) memory operand.
   2753 ; GENERIC-LABEL: test_pgatherdd:
   2754 ; GENERIC:       # %bb.0:
   2755 ; GENERIC-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2756 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2757 ;
   2758 ; HASWELL-LABEL: test_pgatherdd:
   2759 ; HASWELL:       # %bb.0:
   2760 ; HASWELL-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
   2761 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2762 ;
   2763 ; BROADWELL-LABEL: test_pgatherdd:
   2764 ; BROADWELL:       # %bb.0:
   2765 ; BROADWELL-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2766 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2767 ;
   2768 ; SKYLAKE-LABEL: test_pgatherdd:
   2769 ; SKYLAKE:       # %bb.0:
   2770 ; SKYLAKE-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2771 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2772 ;
   2773 ; SKX-LABEL: test_pgatherdd:
   2774 ; SKX:       # %bb.0:
   2775 ; SKX-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2776 ; SKX-NEXT:    retq # sched: [7:1.00]
   2777 ;
   2778 ; ZNVER1-LABEL: test_pgatherdd:
   2779 ; ZNVER1:       # %bb.0:
   2780 ; ZNVER1-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
   2781 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2782   %1 = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3, i8 2)
   2783   ret <4 x i32> %1
   2784 }
   2785 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
   2786 
   2787 define <8 x i32> @test_pgatherdd_ymm(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3) {
        ; Covers vpgatherdd with ymm operands via llvm.x86.avx2.gather.d.d.256. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%ymm1,2) memory operand.
   2788 ; GENERIC-LABEL: test_pgatherdd_ymm:
   2789 ; GENERIC:       # %bb.0:
   2790 ; GENERIC-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
   2791 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2792 ;
   2793 ; HASWELL-LABEL: test_pgatherdd_ymm:
   2794 ; HASWELL:       # %bb.0:
   2795 ; HASWELL-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [27:6.50]
   2796 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2797 ;
   2798 ; BROADWELL-LABEL: test_pgatherdd_ymm:
   2799 ; BROADWELL:       # %bb.0:
   2800 ; BROADWELL-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
   2801 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2802 ;
   2803 ; SKYLAKE-LABEL: test_pgatherdd_ymm:
   2804 ; SKYLAKE:       # %bb.0:
   2805 ; SKYLAKE-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
   2806 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2807 ;
   2808 ; SKX-LABEL: test_pgatherdd_ymm:
   2809 ; SKX:       # %bb.0:
   2810 ; SKX-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
   2811 ; SKX-NEXT:    retq # sched: [7:1.00]
   2812 ;
   2813 ; ZNVER1-LABEL: test_pgatherdd_ymm:
   2814 ; ZNVER1:       # %bb.0:
   2815 ; ZNVER1-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:0.25]
   2816 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2817   %1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3, i8 2)
   2818   ret <8 x i32> %1
   2819 }
   2820 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
   2821 
   2822 define <2 x i64> @test_pgatherdq(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3) {
        ; Covers vpgatherdq with xmm operands via llvm.x86.avx2.gather.d.q. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%xmm1,2) memory operand.
   2823 ; GENERIC-LABEL: test_pgatherdq:
   2824 ; GENERIC:       # %bb.0:
   2825 ; GENERIC-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2826 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2827 ;
   2828 ; HASWELL-LABEL: test_pgatherdq:
   2829 ; HASWELL:       # %bb.0:
   2830 ; HASWELL-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
   2831 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2832 ;
   2833 ; BROADWELL-LABEL: test_pgatherdq:
   2834 ; BROADWELL:       # %bb.0:
   2835 ; BROADWELL-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2836 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2837 ;
   2838 ; SKYLAKE-LABEL: test_pgatherdq:
   2839 ; SKYLAKE:       # %bb.0:
   2840 ; SKYLAKE-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2841 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2842 ;
   2843 ; SKX-LABEL: test_pgatherdq:
   2844 ; SKX:       # %bb.0:
   2845 ; SKX-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2846 ; SKX-NEXT:    retq # sched: [7:1.00]
   2847 ;
   2848 ; ZNVER1-LABEL: test_pgatherdq:
   2849 ; ZNVER1:       # %bb.0:
   2850 ; ZNVER1-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
   2851 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2852   %1 = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3, i8 2)
   2853   ret <2 x i64> %1
   2854 }
   2855 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
   2856 
   2857 define <4 x i64> @test_pgatherdq_ymm(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3) {
        ; Covers vpgatherdq with ymm data operands and an xmm register in the
        ; memory operand, via llvm.x86.avx2.gather.d.q.256. The trailing i8 2
        ; argument matches the scale of 2 in the expected (%rdi,%xmm1,2) operand.
   2858 ; GENERIC-LABEL: test_pgatherdq_ymm:
   2859 ; GENERIC:       # %bb.0:
   2860 ; GENERIC-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50]
   2861 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2862 ;
   2863 ; HASWELL-LABEL: test_pgatherdq_ymm:
   2864 ; HASWELL:       # %bb.0:
   2865 ; HASWELL-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [27:4.00]
   2866 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2867 ;
   2868 ; BROADWELL-LABEL: test_pgatherdq_ymm:
   2869 ; BROADWELL:       # %bb.0:
   2870 ; BROADWELL-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50]
   2871 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2872 ;
   2873 ; SKYLAKE-LABEL: test_pgatherdq_ymm:
   2874 ; SKYLAKE:       # %bb.0:
   2875 ; SKYLAKE-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
   2876 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2877 ;
   2878 ; SKX-LABEL: test_pgatherdq_ymm:
   2879 ; SKX:       # %bb.0:
   2880 ; SKX-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
   2881 ; SKX-NEXT:    retq # sched: [7:1.00]
   2882 ;
   2883 ; ZNVER1-LABEL: test_pgatherdq_ymm:
   2884 ; ZNVER1:       # %bb.0:
   2885 ; ZNVER1-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [100:0.25]
   2886 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2887   %1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3, i8 2)
   2888   ret <4 x i64> %1
   2889 }
   2890 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
   2891 
   2892 define <4 x i32> @test_pgatherqd(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3) {
        ; Covers vpgatherqd with xmm operands via llvm.x86.avx2.gather.q.d. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%xmm1,2) memory operand.
   2893 ; GENERIC-LABEL: test_pgatherqd:
   2894 ; GENERIC:       # %bb.0:
   2895 ; GENERIC-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2896 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2897 ;
   2898 ; HASWELL-LABEL: test_pgatherqd:
   2899 ; HASWELL:       # %bb.0:
   2900 ; HASWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:5.00]
   2901 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2902 ;
   2903 ; BROADWELL-LABEL: test_pgatherqd:
   2904 ; BROADWELL:       # %bb.0:
   2905 ; BROADWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2906 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2907 ;
   2908 ; SKYLAKE-LABEL: test_pgatherqd:
   2909 ; SKYLAKE:       # %bb.0:
   2910 ; SKYLAKE-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2911 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2912 ;
   2913 ; SKX-LABEL: test_pgatherqd:
   2914 ; SKX:       # %bb.0:
   2915 ; SKX-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2916 ; SKX-NEXT:    retq # sched: [7:1.00]
   2917 ;
   2918 ; ZNVER1-LABEL: test_pgatherqd:
   2919 ; ZNVER1:       # %bb.0:
   2920 ; ZNVER1-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
   2921 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2922   %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3, i8 2)
   2923   ret <4 x i32> %1
   2924 }
   2925 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
   2926 
   2927 define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3) {
        ; Covers vpgatherqd with xmm data operands and a ymm register in the memory
        ; operand, via llvm.x86.avx2.gather.q.d.256. The trailing i8 2 argument
        ; matches the scale of 2 in the expected (%rdi,%ymm1,2) operand.
        ; Every expected sequence also contains vzeroupper before retq.
        ; NOTE(review): presumably because a ymm register is used while the result
        ; is returned in xmm0 - confirm against the vzeroupper insertion pass.
   2928 ; GENERIC-LABEL: test_pgatherqd_ymm:
   2929 ; GENERIC:       # %bb.0:
   2930 ; GENERIC-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
   2931 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
   2932 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2933 ;
   2934 ; HASWELL-LABEL: test_pgatherqd_ymm:
   2935 ; HASWELL:       # %bb.0:
   2936 ; HASWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [28:5.00]
   2937 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
   2938 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2939 ;
   2940 ; BROADWELL-LABEL: test_pgatherqd_ymm:
   2941 ; BROADWELL:       # %bb.0:
   2942 ; BROADWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
   2943 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
   2944 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2945 ;
   2946 ; SKYLAKE-LABEL: test_pgatherqd_ymm:
   2947 ; SKYLAKE:       # %bb.0:
   2948 ; SKYLAKE-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
   2949 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
   2950 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2951 ;
   2952 ; SKX-LABEL: test_pgatherqd_ymm:
   2953 ; SKX:       # %bb.0:
   2954 ; SKX-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
   2955 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
   2956 ; SKX-NEXT:    retq # sched: [7:1.00]
   2957 ;
   2958 ; ZNVER1-LABEL: test_pgatherqd_ymm:
   2959 ; ZNVER1:       # %bb.0:
   2960 ; ZNVER1-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [100:0.25]
   2961 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
   2962 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2963   %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3, i8 2)
   2964   ret <4 x i32> %1
   2965 }
   2966 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
   2967 
   2968 define <2 x i64> @test_pgatherqq(<2 x i64> %a0, i8 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
        ; Covers vpgatherqq with xmm operands via llvm.x86.avx2.gather.q.q. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%xmm1,2) memory operand.
   2969 ; GENERIC-LABEL: test_pgatherqq:
   2970 ; GENERIC:       # %bb.0:
   2971 ; GENERIC-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2972 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   2973 ;
   2974 ; HASWELL-LABEL: test_pgatherqq:
   2975 ; HASWELL:       # %bb.0:
   2976 ; HASWELL-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
   2977 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   2978 ;
   2979 ; BROADWELL-LABEL: test_pgatherqq:
   2980 ; BROADWELL:       # %bb.0:
   2981 ; BROADWELL-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
   2982 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   2983 ;
   2984 ; SKYLAKE-LABEL: test_pgatherqq:
   2985 ; SKYLAKE:       # %bb.0:
   2986 ; SKYLAKE-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2987 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   2988 ;
   2989 ; SKX-LABEL: test_pgatherqq:
   2990 ; SKX:       # %bb.0:
   2991 ; SKX-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
   2992 ; SKX-NEXT:    retq # sched: [7:1.00]
   2993 ;
   2994 ; ZNVER1-LABEL: test_pgatherqq:
   2995 ; ZNVER1:       # %bb.0:
   2996 ; ZNVER1-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
   2997 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   2998   %1 = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %a1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
   2999   ret <2 x i64> %1
   3000 }
   3001 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
   3002 
   3003 define <4 x i64> @test_pgatherqq_ymm(<4 x i64> %a0, i8 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
        ; Covers vpgatherqq with ymm operands via llvm.x86.avx2.gather.q.q.256. The
        ; trailing i8 2 argument matches the scale of 2 in the expected
        ; (%rdi,%ymm1,2) memory operand.
   3004 ; GENERIC-LABEL: test_pgatherqq_ymm:
   3005 ; GENERIC:       # %bb.0:
   3006 ; GENERIC-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
   3007 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3008 ;
   3009 ; HASWELL-LABEL: test_pgatherqq_ymm:
   3010 ; HASWELL:       # %bb.0:
   3011 ; HASWELL-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [24:5.00]
   3012 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3013 ;
   3014 ; BROADWELL-LABEL: test_pgatherqq_ymm:
   3015 ; BROADWELL:       # %bb.0:
   3016 ; BROADWELL-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
   3017 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3018 ;
   3019 ; SKYLAKE-LABEL: test_pgatherqq_ymm:
   3020 ; SKYLAKE:       # %bb.0:
   3021 ; SKYLAKE-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
   3022 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3023 ;
   3024 ; SKX-LABEL: test_pgatherqq_ymm:
   3025 ; SKX:       # %bb.0:
   3026 ; SKX-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
   3027 ; SKX-NEXT:    retq # sched: [7:1.00]
   3028 ;
   3029 ; ZNVER1-LABEL: test_pgatherqq_ymm:
   3030 ; ZNVER1:       # %bb.0:
   3031 ; ZNVER1-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:0.25]
   3032 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3033   %1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %a1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
   3034   ret <4 x i64> %1
   3035 }
   3036 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
   3037 
   3038 define <8 x i32> @test_phaddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Covers vphaddd with a register source and with a memory source: the
        ; second intrinsic call consumes the result of the first together with the
        ; load of %a2, and its result is returned.
   3039 ; GENERIC-LABEL: test_phaddd:
   3040 ; GENERIC:       # %bb.0:
   3041 ; GENERIC-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3042 ; GENERIC-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3043 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3044 ;
   3045 ; HASWELL-LABEL: test_phaddd:
   3046 ; HASWELL:       # %bb.0:
   3047 ; HASWELL-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3048 ; HASWELL-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3049 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3050 ;
   3051 ; BROADWELL-LABEL: test_phaddd:
   3052 ; BROADWELL:       # %bb.0:
   3053 ; BROADWELL-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3054 ; BROADWELL-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3055 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3056 ;
   3057 ; SKYLAKE-LABEL: test_phaddd:
   3058 ; SKYLAKE:       # %bb.0:
   3059 ; SKYLAKE-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3060 ; SKYLAKE-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3061 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3062 ;
   3063 ; SKX-LABEL: test_phaddd:
   3064 ; SKX:       # %bb.0:
   3065 ; SKX-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3066 ; SKX-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3067 ; SKX-NEXT:    retq # sched: [7:1.00]
   3068 ;
   3069 ; ZNVER1-LABEL: test_phaddd:
   3070 ; ZNVER1:       # %bb.0:
   3071 ; ZNVER1-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3072 ; ZNVER1-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3073 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3074   %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
   3075   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   3076   %3 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2)
   3077   ret <8 x i32> %3
   3078 }
   3079 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
   3080 
   3081 define <16 x i16> @test_phaddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Covers vphaddsw with a register source and with a memory source: the
        ; second intrinsic call consumes the result of the first together with the
        ; load of %a2, and its result is returned.
   3082 ; GENERIC-LABEL: test_phaddsw:
   3083 ; GENERIC:       # %bb.0:
   3084 ; GENERIC-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3085 ; GENERIC-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3086 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3087 ;
   3088 ; HASWELL-LABEL: test_phaddsw:
   3089 ; HASWELL:       # %bb.0:
   3090 ; HASWELL-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3091 ; HASWELL-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3092 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3093 ;
   3094 ; BROADWELL-LABEL: test_phaddsw:
   3095 ; BROADWELL:       # %bb.0:
   3096 ; BROADWELL-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3097 ; BROADWELL-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3098 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3099 ;
   3100 ; SKYLAKE-LABEL: test_phaddsw:
   3101 ; SKYLAKE:       # %bb.0:
   3102 ; SKYLAKE-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3103 ; SKYLAKE-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3104 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3105 ;
   3106 ; SKX-LABEL: test_phaddsw:
   3107 ; SKX:       # %bb.0:
   3108 ; SKX-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3109 ; SKX-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3110 ; SKX-NEXT:    retq # sched: [7:1.00]
   3111 ;
   3112 ; ZNVER1-LABEL: test_phaddsw:
   3113 ; ZNVER1:       # %bb.0:
   3114 ; ZNVER1-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3115 ; ZNVER1-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3116 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3117   %1 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
   3118   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3119   %3 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %1, <16 x i16> %2)
   3120   ret <16 x i16> %3
   3121 }
   3122 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
   3123 
   3124 define <16 x i16> @test_phaddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Covers vphaddw with a register source and with a memory source: the
        ; second intrinsic call consumes the result of the first together with the
        ; load of %a2, and its result is returned.
   3125 ; GENERIC-LABEL: test_phaddw:
   3126 ; GENERIC:       # %bb.0:
   3127 ; GENERIC-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3128 ; GENERIC-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3129 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3130 ;
   3131 ; HASWELL-LABEL: test_phaddw:
   3132 ; HASWELL:       # %bb.0:
   3133 ; HASWELL-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3134 ; HASWELL-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3135 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3136 ;
   3137 ; BROADWELL-LABEL: test_phaddw:
   3138 ; BROADWELL:       # %bb.0:
   3139 ; BROADWELL-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3140 ; BROADWELL-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3141 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3142 ;
   3143 ; SKYLAKE-LABEL: test_phaddw:
   3144 ; SKYLAKE:       # %bb.0:
   3145 ; SKYLAKE-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3146 ; SKYLAKE-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3147 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3148 ;
   3149 ; SKX-LABEL: test_phaddw:
   3150 ; SKX:       # %bb.0:
   3151 ; SKX-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3152 ; SKX-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3153 ; SKX-NEXT:    retq # sched: [7:1.00]
   3154 ;
   3155 ; ZNVER1-LABEL: test_phaddw:
   3156 ; ZNVER1:       # %bb.0:
   3157 ; ZNVER1-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3158 ; ZNVER1-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3159 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3160   %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
   3161   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3162   %3 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2)
   3163   ret <16 x i16> %3
   3164 }
   3165 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
   3166 
   3167 define <8 x i32> @test_phsubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Chains two llvm.x86.avx2.phsub.d calls: the first takes both vector
        ; arguments, the second takes that result and an <8 x i32> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vphsubd followed by a register-memory vphsubd.
   3168 ; GENERIC-LABEL: test_phsubd:
   3169 ; GENERIC:       # %bb.0:
   3170 ; GENERIC-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3171 ; GENERIC-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3172 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3173 ;
   3174 ; HASWELL-LABEL: test_phsubd:
   3175 ; HASWELL:       # %bb.0:
   3176 ; HASWELL-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3177 ; HASWELL-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3178 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3179 ;
   3180 ; BROADWELL-LABEL: test_phsubd:
   3181 ; BROADWELL:       # %bb.0:
   3182 ; BROADWELL-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3183 ; BROADWELL-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3184 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3185 ;
   3186 ; SKYLAKE-LABEL: test_phsubd:
   3187 ; SKYLAKE:       # %bb.0:
   3188 ; SKYLAKE-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3189 ; SKYLAKE-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3190 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3191 ;
   3192 ; SKX-LABEL: test_phsubd:
   3193 ; SKX:       # %bb.0:
   3194 ; SKX-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3195 ; SKX-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3196 ; SKX-NEXT:    retq # sched: [7:1.00]
   3197 ;
   3198 ; ZNVER1-LABEL: test_phsubd:
   3199 ; ZNVER1:       # %bb.0:
   3200 ; ZNVER1-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3201 ; ZNVER1-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3202 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3203   %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
   3204   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   3205   %3 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %1, <8 x i32> %2)
   3206   ret <8 x i32> %3
   3207 }
   3208 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
   3209 
   3210 define <16 x i16> @test_phsubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Chains two llvm.x86.avx2.phsub.sw calls: the first takes both vector
        ; arguments, the second takes that result and a <16 x i16> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vphsubsw followed by a register-memory vphsubsw.
   3211 ; GENERIC-LABEL: test_phsubsw:
   3212 ; GENERIC:       # %bb.0:
   3213 ; GENERIC-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3214 ; GENERIC-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3215 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3216 ;
   3217 ; HASWELL-LABEL: test_phsubsw:
   3218 ; HASWELL:       # %bb.0:
   3219 ; HASWELL-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3220 ; HASWELL-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3221 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3222 ;
   3223 ; BROADWELL-LABEL: test_phsubsw:
   3224 ; BROADWELL:       # %bb.0:
   3225 ; BROADWELL-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3226 ; BROADWELL-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3227 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3228 ;
   3229 ; SKYLAKE-LABEL: test_phsubsw:
   3230 ; SKYLAKE:       # %bb.0:
   3231 ; SKYLAKE-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3232 ; SKYLAKE-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3233 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3234 ;
   3235 ; SKX-LABEL: test_phsubsw:
   3236 ; SKX:       # %bb.0:
   3237 ; SKX-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3238 ; SKX-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3239 ; SKX-NEXT:    retq # sched: [7:1.00]
   3240 ;
   3241 ; ZNVER1-LABEL: test_phsubsw:
   3242 ; ZNVER1:       # %bb.0:
   3243 ; ZNVER1-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3244 ; ZNVER1-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3245 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3246   %1 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
   3247   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3248   %3 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %1, <16 x i16> %2)
   3249   ret <16 x i16> %3
   3250 }
   3251 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
   3252 
   3253 define <16 x i16> @test_phsubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Chains two llvm.x86.avx2.phsub.w calls: the first takes both vector
        ; arguments, the second takes that result and a <16 x i16> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vphsubw followed by a register-memory vphsubw.
   3254 ; GENERIC-LABEL: test_phsubw:
   3255 ; GENERIC:       # %bb.0:
   3256 ; GENERIC-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
   3257 ; GENERIC-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
   3258 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3259 ;
   3260 ; HASWELL-LABEL: test_phsubw:
   3261 ; HASWELL:       # %bb.0:
   3262 ; HASWELL-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3263 ; HASWELL-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3264 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3265 ;
   3266 ; BROADWELL-LABEL: test_phsubw:
   3267 ; BROADWELL:       # %bb.0:
   3268 ; BROADWELL-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3269 ; BROADWELL-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   3270 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3271 ;
   3272 ; SKYLAKE-LABEL: test_phsubw:
   3273 ; SKYLAKE:       # %bb.0:
   3274 ; SKYLAKE-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3275 ; SKYLAKE-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3276 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3277 ;
   3278 ; SKX-LABEL: test_phsubw:
   3279 ; SKX:       # %bb.0:
   3280 ; SKX-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   3281 ; SKX-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   3282 ; SKX-NEXT:    retq # sched: [7:1.00]
   3283 ;
   3284 ; ZNVER1-LABEL: test_phsubw:
   3285 ; ZNVER1:       # %bb.0:
   3286 ; ZNVER1-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
   3287 ; ZNVER1-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
   3288 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3289   %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
   3290   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3291   %3 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %2)
   3292   ret <16 x i16> %3
   3293 }
   3294 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
   3295 
   3296 define <16 x i16> @test_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Chains two llvm.x86.avx2.pmadd.ub.sw calls. The intrinsic returns
        ; <16 x i16> but takes <32 x i8> operands, so the first result is bitcast
        ; back to <32 x i8> before being paired with the vector loaded from %a2.
        ; The expected output lists a register-register vpmaddubsw followed by a
        ; register-memory vpmaddubsw for every RUN prefix.
   3297 ; GENERIC-LABEL: test_pmaddubsw:
   3298 ; GENERIC:       # %bb.0:
   3299 ; GENERIC-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3300 ; GENERIC-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   3301 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3302 ;
   3303 ; HASWELL-LABEL: test_pmaddubsw:
   3304 ; HASWELL:       # %bb.0:
   3305 ; HASWELL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3306 ; HASWELL-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   3307 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3308 ;
   3309 ; BROADWELL-LABEL: test_pmaddubsw:
   3310 ; BROADWELL:       # %bb.0:
   3311 ; BROADWELL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3312 ; BROADWELL-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   3313 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3314 ;
   3315 ; SKYLAKE-LABEL: test_pmaddubsw:
   3316 ; SKYLAKE:       # %bb.0:
   3317 ; SKYLAKE-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   3318 ; SKYLAKE-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   3319 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3320 ;
   3321 ; SKX-LABEL: test_pmaddubsw:
   3322 ; SKX:       # %bb.0:
   3323 ; SKX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   3324 ; SKX-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   3325 ; SKX-NEXT:    retq # sched: [7:1.00]
   3326 ;
   3327 ; ZNVER1-LABEL: test_pmaddubsw:
   3328 ; ZNVER1:       # %bb.0:
   3329 ; ZNVER1-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   3330 ; ZNVER1-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   3331 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3332   %1 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
   3333   %2 = bitcast <16 x i16> %1 to <32 x i8>
   3334   %3 = load <32 x i8>, <32 x i8> *%a2, align 32
   3335   %4 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %2, <32 x i8> %3)
   3336   ret <16 x i16> %4
   3337 }
   3338 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
   3339 
   3340 define <8 x i32> @test_pmaddwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Chains two llvm.x86.avx2.pmadd.wd calls. The intrinsic returns <8 x i32>
        ; but takes <16 x i16> operands, so the first result is bitcast back to
        ; <16 x i16> before being paired with the vector loaded from %a2. The
        ; expected output lists a register-register vpmaddwd followed by a
        ; register-memory vpmaddwd for every RUN prefix.
   3341 ; GENERIC-LABEL: test_pmaddwd:
   3342 ; GENERIC:       # %bb.0:
   3343 ; GENERIC-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3344 ; GENERIC-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   3345 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3346 ;
   3347 ; HASWELL-LABEL: test_pmaddwd:
   3348 ; HASWELL:       # %bb.0:
   3349 ; HASWELL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3350 ; HASWELL-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   3351 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3352 ;
   3353 ; BROADWELL-LABEL: test_pmaddwd:
   3354 ; BROADWELL:       # %bb.0:
   3355 ; BROADWELL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   3356 ; BROADWELL-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   3357 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3358 ;
   3359 ; SKYLAKE-LABEL: test_pmaddwd:
   3360 ; SKYLAKE:       # %bb.0:
   3361 ; SKYLAKE-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   3362 ; SKYLAKE-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   3363 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3364 ;
   3365 ; SKX-LABEL: test_pmaddwd:
   3366 ; SKX:       # %bb.0:
   3367 ; SKX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   3368 ; SKX-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   3369 ; SKX-NEXT:    retq # sched: [7:1.00]
   3370 ;
   3371 ; ZNVER1-LABEL: test_pmaddwd:
   3372 ; ZNVER1:       # %bb.0:
   3373 ; ZNVER1-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   3374 ; ZNVER1-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   3375 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3376   %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
   3377   %2 = bitcast <8 x i32> %1 to <16 x i16>
   3378   %3 = load <16 x i16>, <16 x i16> *%a2, align 32
   3379   %4 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %2, <16 x i16> %3)
   3380   ret <8 x i32> %4
   3381 }
   3382 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
   3383 
   3384 define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
        ; Calls llvm.x86.avx2.maskload.d on pointer %a0 with vector %a1, then
        ; llvm.x86.avx2.maskstore.d on the same pointer and %a1 with %a2, and
        ; returns the loaded value. The expected output therefore lists the load
        ; form of vpmaskmovd, the store form, and a vmovdqa that moves the loaded
        ; vector into the return register.
   3385 ; GENERIC-LABEL: test_pmaskmovd:
   3386 ; GENERIC:       # %bb.0:
   3387 ; GENERIC-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
   3388 ; GENERIC-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3389 ; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3390 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3391 ;
   3392 ; HASWELL-LABEL: test_pmaskmovd:
   3393 ; HASWELL:       # %bb.0:
   3394 ; HASWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
   3395 ; HASWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3396 ; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3397 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3398 ;
   3399 ; BROADWELL-LABEL: test_pmaskmovd:
   3400 ; BROADWELL:       # %bb.0:
   3401 ; BROADWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
   3402 ; BROADWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3403 ; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3404 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3405 ;
   3406 ; SKYLAKE-LABEL: test_pmaskmovd:
   3407 ; SKYLAKE:       # %bb.0:
   3408 ; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
   3409 ; SKYLAKE-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
   3410 ; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3411 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3412 ;
   3413 ; SKX-LABEL: test_pmaskmovd:
   3414 ; SKX:       # %bb.0:
   3415 ; SKX-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
   3416 ; SKX-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
   3417 ; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3418 ; SKX-NEXT:    retq # sched: [7:1.00]
   3419 ;
   3420 ; ZNVER1-LABEL: test_pmaskmovd:
   3421 ; ZNVER1:       # %bb.0:
   3422 ; ZNVER1-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [100:0.25]
   3423 ; ZNVER1-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [100:0.25]
   3424 ; ZNVER1-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
   3425 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3426   %1 = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1)
   3427   call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
   3428   ret <4 x i32> %1
   3429 }
   3430 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
   3431 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
   3432 
   3433 define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
        ; 256-bit counterpart of the masked load/store test: calls
        ; llvm.x86.avx2.maskload.d.256 on pointer %a0 with vector %a1, then
        ; llvm.x86.avx2.maskstore.d.256 on the same pointer and %a1 with %a2, and
        ; returns the loaded value. The expected output lists the ymm load form
        ; of vpmaskmovd, the store form, and a vmovdqa into the return register.
   3434 ; GENERIC-LABEL: test_pmaskmovd_ymm:
   3435 ; GENERIC:       # %bb.0:
   3436 ; GENERIC-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
   3437 ; GENERIC-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3438 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   3439 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3440 ;
   3441 ; HASWELL-LABEL: test_pmaskmovd_ymm:
   3442 ; HASWELL:       # %bb.0:
   3443 ; HASWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
   3444 ; HASWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3445 ; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3446 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3447 ;
   3448 ; BROADWELL-LABEL: test_pmaskmovd_ymm:
   3449 ; BROADWELL:       # %bb.0:
   3450 ; BROADWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
   3451 ; BROADWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3452 ; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3453 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3454 ;
   3455 ; SKYLAKE-LABEL: test_pmaskmovd_ymm:
   3456 ; SKYLAKE:       # %bb.0:
   3457 ; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
   3458 ; SKYLAKE-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
   3459 ; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3460 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3461 ;
   3462 ; SKX-LABEL: test_pmaskmovd_ymm:
   3463 ; SKX:       # %bb.0:
   3464 ; SKX-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
   3465 ; SKX-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
   3466 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3467 ; SKX-NEXT:    retq # sched: [7:1.00]
   3468 ;
   3469 ; ZNVER1-LABEL: test_pmaskmovd_ymm:
   3470 ; ZNVER1:       # %bb.0:
   3471 ; ZNVER1-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [100:0.25]
   3472 ; ZNVER1-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [100:0.25]
   3473 ; ZNVER1-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
   3474 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3475   %1 = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1)
   3476   call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
   3477   ret <8 x i32> %1
   3478 }
   3479 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
   3480 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
   3481 
   3482 define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
        ; Calls llvm.x86.avx2.maskload.q on pointer %a0 with vector %a1, then
        ; llvm.x86.avx2.maskstore.q on the same pointer and %a1 with %a2, and
        ; returns the loaded value. The expected output therefore lists the load
        ; form of vpmaskmovq, the store form, and a vmovdqa that moves the loaded
        ; vector into the return register.
   3483 ; GENERIC-LABEL: test_pmaskmovq:
   3484 ; GENERIC:       # %bb.0:
   3485 ; GENERIC-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
   3486 ; GENERIC-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3487 ; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3488 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3489 ;
   3490 ; HASWELL-LABEL: test_pmaskmovq:
   3491 ; HASWELL:       # %bb.0:
   3492 ; HASWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
   3493 ; HASWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3494 ; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3495 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3496 ;
   3497 ; BROADWELL-LABEL: test_pmaskmovq:
   3498 ; BROADWELL:       # %bb.0:
   3499 ; BROADWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
   3500 ; BROADWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
   3501 ; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3502 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3503 ;
   3504 ; SKYLAKE-LABEL: test_pmaskmovq:
   3505 ; SKYLAKE:       # %bb.0:
   3506 ; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
   3507 ; SKYLAKE-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
   3508 ; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3509 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3510 ;
   3511 ; SKX-LABEL: test_pmaskmovq:
   3512 ; SKX:       # %bb.0:
   3513 ; SKX-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
   3514 ; SKX-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
   3515 ; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
   3516 ; SKX-NEXT:    retq # sched: [7:1.00]
   3517 ;
   3518 ; ZNVER1-LABEL: test_pmaskmovq:
   3519 ; ZNVER1:       # %bb.0:
   3520 ; ZNVER1-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
   3521 ; ZNVER1-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [100:0.25]
   3522 ; ZNVER1-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
   3523 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3524   %1 = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1)
   3525   call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
   3526   ret <2 x i64> %1
   3527 }
   3528 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
   3529 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
   3530 
   3531 define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
        ; 256-bit counterpart of the masked load/store test: calls
        ; llvm.x86.avx2.maskload.q.256 on pointer %a0 with vector %a1, then
        ; llvm.x86.avx2.maskstore.q.256 on the same pointer and %a1 with %a2, and
        ; returns the loaded value. The expected output lists the ymm load form
        ; of vpmaskmovq, the store form, and a vmovdqa into the return register.
   3532 ; GENERIC-LABEL: test_pmaskmovq_ymm:
   3533 ; GENERIC:       # %bb.0:
   3534 ; GENERIC-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
   3535 ; GENERIC-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3536 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
   3537 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3538 ;
   3539 ; HASWELL-LABEL: test_pmaskmovq_ymm:
   3540 ; HASWELL:       # %bb.0:
   3541 ; HASWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
   3542 ; HASWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3543 ; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3544 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3545 ;
   3546 ; BROADWELL-LABEL: test_pmaskmovq_ymm:
   3547 ; BROADWELL:       # %bb.0:
   3548 ; BROADWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
   3549 ; BROADWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
   3550 ; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3551 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3552 ;
   3553 ; SKYLAKE-LABEL: test_pmaskmovq_ymm:
   3554 ; SKYLAKE:       # %bb.0:
   3555 ; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
   3556 ; SKYLAKE-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
   3557 ; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3558 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3559 ;
   3560 ; SKX-LABEL: test_pmaskmovq_ymm:
   3561 ; SKX:       # %bb.0:
   3562 ; SKX-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
   3563 ; SKX-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
   3564 ; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
   3565 ; SKX-NEXT:    retq # sched: [7:1.00]
   3566 ;
   3567 ; ZNVER1-LABEL: test_pmaskmovq_ymm:
   3568 ; ZNVER1:       # %bb.0:
   3569 ; ZNVER1-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:1.50]
   3570 ; ZNVER1-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [100:0.25]
   3571 ; ZNVER1-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
   3572 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3573   %1 = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1)
   3574   call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
   3575   ret <4 x i64> %1
   3576 }
   3577 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
   3578 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
   3579 
   3580 define <32 x i8> @test_pmaxsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Chains two llvm.x86.avx2.pmaxs.b calls: the first takes both vector
        ; arguments, the second takes that result and a <32 x i8> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vpmaxsb followed by a register-memory vpmaxsb.
   3581 ; GENERIC-LABEL: test_pmaxsb:
   3582 ; GENERIC:       # %bb.0:
   3583 ; GENERIC-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3584 ; GENERIC-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3585 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3586 ;
   3587 ; HASWELL-LABEL: test_pmaxsb:
   3588 ; HASWELL:       # %bb.0:
   3589 ; HASWELL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3590 ; HASWELL-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3591 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3592 ;
   3593 ; BROADWELL-LABEL: test_pmaxsb:
   3594 ; BROADWELL:       # %bb.0:
   3595 ; BROADWELL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3596 ; BROADWELL-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3597 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3598 ;
   3599 ; SKYLAKE-LABEL: test_pmaxsb:
   3600 ; SKYLAKE:       # %bb.0:
   3601 ; SKYLAKE-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3602 ; SKYLAKE-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3603 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3604 ;
   3605 ; SKX-LABEL: test_pmaxsb:
   3606 ; SKX:       # %bb.0:
   3607 ; SKX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3608 ; SKX-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3609 ; SKX-NEXT:    retq # sched: [7:1.00]
   3610 ;
   3611 ; ZNVER1-LABEL: test_pmaxsb:
   3612 ; ZNVER1:       # %bb.0:
   3613 ; ZNVER1-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3614 ; ZNVER1-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3615 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3616   %1 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
   3617   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   3618   %3 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %1, <32 x i8> %2)
   3619   ret <32 x i8> %3
   3620 }
   3621 declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
   3622 
   3623 define <8 x i32> @test_pmaxsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Chains two llvm.x86.avx2.pmaxs.d calls: the first takes both vector
        ; arguments, the second takes that result and an <8 x i32> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vpmaxsd followed by a register-memory vpmaxsd.
   3624 ; GENERIC-LABEL: test_pmaxsd:
   3625 ; GENERIC:       # %bb.0:
   3626 ; GENERIC-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3627 ; GENERIC-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3628 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3629 ;
   3630 ; HASWELL-LABEL: test_pmaxsd:
   3631 ; HASWELL:       # %bb.0:
   3632 ; HASWELL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3633 ; HASWELL-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3634 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3635 ;
   3636 ; BROADWELL-LABEL: test_pmaxsd:
   3637 ; BROADWELL:       # %bb.0:
   3638 ; BROADWELL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3639 ; BROADWELL-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3640 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3641 ;
   3642 ; SKYLAKE-LABEL: test_pmaxsd:
   3643 ; SKYLAKE:       # %bb.0:
   3644 ; SKYLAKE-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3645 ; SKYLAKE-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3646 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3647 ;
   3648 ; SKX-LABEL: test_pmaxsd:
   3649 ; SKX:       # %bb.0:
   3650 ; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3651 ; SKX-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3652 ; SKX-NEXT:    retq # sched: [7:1.00]
   3653 ;
   3654 ; ZNVER1-LABEL: test_pmaxsd:
   3655 ; ZNVER1:       # %bb.0:
   3656 ; ZNVER1-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3657 ; ZNVER1-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3658 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3659   %1 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
   3660   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   3661   %3 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %1, <8 x i32> %2)
   3662   ret <8 x i32> %3
   3663 }
   3664 declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
   3665 
   3666 define <16 x i16> @test_pmaxsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Chains two llvm.x86.avx2.pmaxs.w calls: the first takes both vector
        ; arguments, the second takes that result and a <16 x i16> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vpmaxsw followed by a register-memory vpmaxsw.
   3667 ; GENERIC-LABEL: test_pmaxsw:
   3668 ; GENERIC:       # %bb.0:
   3669 ; GENERIC-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3670 ; GENERIC-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3671 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3672 ;
   3673 ; HASWELL-LABEL: test_pmaxsw:
   3674 ; HASWELL:       # %bb.0:
   3675 ; HASWELL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3676 ; HASWELL-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3677 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3678 ;
   3679 ; BROADWELL-LABEL: test_pmaxsw:
   3680 ; BROADWELL:       # %bb.0:
   3681 ; BROADWELL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3682 ; BROADWELL-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3683 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3684 ;
   3685 ; SKYLAKE-LABEL: test_pmaxsw:
   3686 ; SKYLAKE:       # %bb.0:
   3687 ; SKYLAKE-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3688 ; SKYLAKE-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3689 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3690 ;
   3691 ; SKX-LABEL: test_pmaxsw:
   3692 ; SKX:       # %bb.0:
   3693 ; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3694 ; SKX-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3695 ; SKX-NEXT:    retq # sched: [7:1.00]
   3696 ;
   3697 ; ZNVER1-LABEL: test_pmaxsw:
   3698 ; ZNVER1:       # %bb.0:
   3699 ; ZNVER1-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3700 ; ZNVER1-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3701 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3702   %1 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
   3703   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3704   %3 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %1, <16 x i16> %2)
   3705   ret <16 x i16> %3
   3706 }
   3707 declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
   3708 
   3709 define <32 x i8> @test_pmaxub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Chains two llvm.x86.avx2.pmaxu.b calls: the first takes both vector
        ; arguments, the second takes that result and a <32 x i8> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vpmaxub followed by a register-memory vpmaxub.
   3710 ; GENERIC-LABEL: test_pmaxub:
   3711 ; GENERIC:       # %bb.0:
   3712 ; GENERIC-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3713 ; GENERIC-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3714 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3715 ;
   3716 ; HASWELL-LABEL: test_pmaxub:
   3717 ; HASWELL:       # %bb.0:
   3718 ; HASWELL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3719 ; HASWELL-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3720 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3721 ;
   3722 ; BROADWELL-LABEL: test_pmaxub:
   3723 ; BROADWELL:       # %bb.0:
   3724 ; BROADWELL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3725 ; BROADWELL-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3726 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3727 ;
   3728 ; SKYLAKE-LABEL: test_pmaxub:
   3729 ; SKYLAKE:       # %bb.0:
   3730 ; SKYLAKE-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3731 ; SKYLAKE-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3732 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3733 ;
   3734 ; SKX-LABEL: test_pmaxub:
   3735 ; SKX:       # %bb.0:
   3736 ; SKX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3737 ; SKX-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3738 ; SKX-NEXT:    retq # sched: [7:1.00]
   3739 ;
   3740 ; ZNVER1-LABEL: test_pmaxub:
   3741 ; ZNVER1:       # %bb.0:
   3742 ; ZNVER1-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3743 ; ZNVER1-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3744 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3745   %1 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
   3746   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   3747   %3 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %1, <32 x i8> %2)
   3748   ret <32 x i8> %3
   3749 }
   3750 declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
   3751 
   3752 define <8 x i32> @test_pmaxud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Chains two llvm.x86.avx2.pmaxu.d calls: the first takes both vector
        ; arguments, the second takes that result and an <8 x i32> loaded from %a2.
        ; The expected output for every RUN prefix therefore lists a
        ; register-register vpmaxud followed by a register-memory vpmaxud.
   3753 ; GENERIC-LABEL: test_pmaxud:
   3754 ; GENERIC:       # %bb.0:
   3755 ; GENERIC-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3756 ; GENERIC-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3757 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3758 ;
   3759 ; HASWELL-LABEL: test_pmaxud:
   3760 ; HASWELL:       # %bb.0:
   3761 ; HASWELL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3762 ; HASWELL-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3763 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3764 ;
   3765 ; BROADWELL-LABEL: test_pmaxud:
   3766 ; BROADWELL:       # %bb.0:
   3767 ; BROADWELL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3768 ; BROADWELL-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3769 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3770 ;
   3771 ; SKYLAKE-LABEL: test_pmaxud:
   3772 ; SKYLAKE:       # %bb.0:
   3773 ; SKYLAKE-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3774 ; SKYLAKE-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3775 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3776 ;
   3777 ; SKX-LABEL: test_pmaxud:
   3778 ; SKX:       # %bb.0:
   3779 ; SKX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3780 ; SKX-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3781 ; SKX-NEXT:    retq # sched: [7:1.00]
   3782 ;
   3783 ; ZNVER1-LABEL: test_pmaxud:
   3784 ; ZNVER1:       # %bb.0:
   3785 ; ZNVER1-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3786 ; ZNVER1-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3787 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   3788   %1 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
   3789   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   3790   %3 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %1, <8 x i32> %2)
   3791   ret <8 x i32> %3
   3792 }
   3793 declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
   3794 
   3795 define <16 x i16> @test_pmaxuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   3796 ; GENERIC-LABEL: test_pmaxuw:
   3797 ; GENERIC:       # %bb.0:
   3798 ; GENERIC-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3799 ; GENERIC-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3800 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3801 ;
   3802 ; HASWELL-LABEL: test_pmaxuw:
   3803 ; HASWELL:       # %bb.0:
   3804 ; HASWELL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3805 ; HASWELL-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3806 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3807 ;
   3808 ; BROADWELL-LABEL: test_pmaxuw:
   3809 ; BROADWELL:       # %bb.0:
   3810 ; BROADWELL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3811 ; BROADWELL-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3812 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3813 ;
   3814 ; SKYLAKE-LABEL: test_pmaxuw:
   3815 ; SKYLAKE:       # %bb.0:
   3816 ; SKYLAKE-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3817 ; SKYLAKE-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3818 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3819 ;
   3820 ; SKX-LABEL: test_pmaxuw:
   3821 ; SKX:       # %bb.0:
   3822 ; SKX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3823 ; SKX-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3824 ; SKX-NEXT:    retq # sched: [7:1.00]
   3825 ;
   3826 ; ZNVER1-LABEL: test_pmaxuw:
   3827 ; ZNVER1:       # %bb.0:
   3828 ; ZNVER1-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3829 ; ZNVER1-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3830 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pmaxu.w intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpmaxuw.
   3831   %1 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
   3832   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3833   %3 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %1, <16 x i16> %2)
   3834   ret <16 x i16> %3
   3835 }
   3836 declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
   3837 
   3838 define <32 x i8> @test_pminsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   3839 ; GENERIC-LABEL: test_pminsb:
   3840 ; GENERIC:       # %bb.0:
   3841 ; GENERIC-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3842 ; GENERIC-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3843 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3844 ;
   3845 ; HASWELL-LABEL: test_pminsb:
   3846 ; HASWELL:       # %bb.0:
   3847 ; HASWELL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3848 ; HASWELL-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3849 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3850 ;
   3851 ; BROADWELL-LABEL: test_pminsb:
   3852 ; BROADWELL:       # %bb.0:
   3853 ; BROADWELL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3854 ; BROADWELL-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3855 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3856 ;
   3857 ; SKYLAKE-LABEL: test_pminsb:
   3858 ; SKYLAKE:       # %bb.0:
   3859 ; SKYLAKE-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3860 ; SKYLAKE-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3861 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3862 ;
   3863 ; SKX-LABEL: test_pminsb:
   3864 ; SKX:       # %bb.0:
   3865 ; SKX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3866 ; SKX-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3867 ; SKX-NEXT:    retq # sched: [7:1.00]
   3868 ;
   3869 ; ZNVER1-LABEL: test_pminsb:
   3870 ; ZNVER1:       # %bb.0:
   3871 ; ZNVER1-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3872 ; ZNVER1-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3873 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pmins.b intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminsb.
   3874   %1 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
   3875   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   3876   %3 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %1, <32 x i8> %2)
   3877   ret <32 x i8> %3
   3878 }
   3879 declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
   3880 
   3881 define <8 x i32> @test_pminsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
   3882 ; GENERIC-LABEL: test_pminsd:
   3883 ; GENERIC:       # %bb.0:
   3884 ; GENERIC-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3885 ; GENERIC-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3886 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3887 ;
   3888 ; HASWELL-LABEL: test_pminsd:
   3889 ; HASWELL:       # %bb.0:
   3890 ; HASWELL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3891 ; HASWELL-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3892 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3893 ;
   3894 ; BROADWELL-LABEL: test_pminsd:
   3895 ; BROADWELL:       # %bb.0:
   3896 ; BROADWELL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3897 ; BROADWELL-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3898 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3899 ;
   3900 ; SKYLAKE-LABEL: test_pminsd:
   3901 ; SKYLAKE:       # %bb.0:
   3902 ; SKYLAKE-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3903 ; SKYLAKE-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3904 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3905 ;
   3906 ; SKX-LABEL: test_pminsd:
   3907 ; SKX:       # %bb.0:
   3908 ; SKX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3909 ; SKX-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3910 ; SKX-NEXT:    retq # sched: [7:1.00]
   3911 ;
   3912 ; ZNVER1-LABEL: test_pminsd:
   3913 ; ZNVER1:       # %bb.0:
   3914 ; ZNVER1-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3915 ; ZNVER1-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3916 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pmins.d intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminsd.
   3917   %1 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
   3918   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   3919   %3 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %1, <8 x i32> %2)
   3920   ret <8 x i32> %3
   3921 }
   3922 declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
   3923 
   3924 define <16 x i16> @test_pminsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   3925 ; GENERIC-LABEL: test_pminsw:
   3926 ; GENERIC:       # %bb.0:
   3927 ; GENERIC-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3928 ; GENERIC-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3929 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3930 ;
   3931 ; HASWELL-LABEL: test_pminsw:
   3932 ; HASWELL:       # %bb.0:
   3933 ; HASWELL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3934 ; HASWELL-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3935 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3936 ;
   3937 ; BROADWELL-LABEL: test_pminsw:
   3938 ; BROADWELL:       # %bb.0:
   3939 ; BROADWELL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3940 ; BROADWELL-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3941 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3942 ;
   3943 ; SKYLAKE-LABEL: test_pminsw:
   3944 ; SKYLAKE:       # %bb.0:
   3945 ; SKYLAKE-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3946 ; SKYLAKE-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3947 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3948 ;
   3949 ; SKX-LABEL: test_pminsw:
   3950 ; SKX:       # %bb.0:
   3951 ; SKX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3952 ; SKX-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3953 ; SKX-NEXT:    retq # sched: [7:1.00]
   3954 ;
   3955 ; ZNVER1-LABEL: test_pminsw:
   3956 ; ZNVER1:       # %bb.0:
   3957 ; ZNVER1-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   3958 ; ZNVER1-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3959 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pmins.w intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminsw.
   3960   %1 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
   3961   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   3962   %3 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %1, <16 x i16> %2)
   3963   ret <16 x i16> %3
   3964 }
   3965 declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
   3966 
   3967 define <32 x i8> @test_pminub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   3968 ; GENERIC-LABEL: test_pminub:
   3969 ; GENERIC:       # %bb.0:
   3970 ; GENERIC-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3971 ; GENERIC-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3972 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   3973 ;
   3974 ; HASWELL-LABEL: test_pminub:
   3975 ; HASWELL:       # %bb.0:
   3976 ; HASWELL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3977 ; HASWELL-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3978 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   3979 ;
   3980 ; BROADWELL-LABEL: test_pminub:
   3981 ; BROADWELL:       # %bb.0:
   3982 ; BROADWELL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3983 ; BROADWELL-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   3984 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   3985 ;
   3986 ; SKYLAKE-LABEL: test_pminub:
   3987 ; SKYLAKE:       # %bb.0:
   3988 ; SKYLAKE-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3989 ; SKYLAKE-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3990 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   3991 ;
   3992 ; SKX-LABEL: test_pminub:
   3993 ; SKX:       # %bb.0:
   3994 ; SKX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   3995 ; SKX-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   3996 ; SKX-NEXT:    retq # sched: [7:1.00]
   3997 ;
   3998 ; ZNVER1-LABEL: test_pminub:
   3999 ; ZNVER1:       # %bb.0:
   4000 ; ZNVER1-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4001 ; ZNVER1-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4002 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pminu.b intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminub.
   4003   %1 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
   4004   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   4005   %3 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %1, <32 x i8> %2)
   4006   ret <32 x i8> %3
   4007 }
   4008 declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
   4009 
   4010 define <8 x i32> @test_pminud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
   4011 ; GENERIC-LABEL: test_pminud:
   4012 ; GENERIC:       # %bb.0:
   4013 ; GENERIC-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4014 ; GENERIC-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4015 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4016 ;
   4017 ; HASWELL-LABEL: test_pminud:
   4018 ; HASWELL:       # %bb.0:
   4019 ; HASWELL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4020 ; HASWELL-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4021 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4022 ;
   4023 ; BROADWELL-LABEL: test_pminud:
   4024 ; BROADWELL:       # %bb.0:
   4025 ; BROADWELL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4026 ; BROADWELL-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   4027 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4028 ;
   4029 ; SKYLAKE-LABEL: test_pminud:
   4030 ; SKYLAKE:       # %bb.0:
   4031 ; SKYLAKE-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4032 ; SKYLAKE-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4033 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4034 ;
   4035 ; SKX-LABEL: test_pminud:
   4036 ; SKX:       # %bb.0:
   4037 ; SKX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4038 ; SKX-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4039 ; SKX-NEXT:    retq # sched: [7:1.00]
   4040 ;
   4041 ; ZNVER1-LABEL: test_pminud:
   4042 ; ZNVER1:       # %bb.0:
   4043 ; ZNVER1-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4044 ; ZNVER1-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4045 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pminu.d intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminud.
   4046   %1 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
   4047   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   4048   %3 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %1, <8 x i32> %2)
   4049   ret <8 x i32> %3
   4050 }
   4051 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
   4052 
   4053 define <16 x i16> @test_pminuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   4054 ; GENERIC-LABEL: test_pminuw:
   4055 ; GENERIC:       # %bb.0:
   4056 ; GENERIC-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4057 ; GENERIC-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4058 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4059 ;
   4060 ; HASWELL-LABEL: test_pminuw:
   4061 ; HASWELL:       # %bb.0:
   4062 ; HASWELL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4063 ; HASWELL-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4064 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4065 ;
   4066 ; BROADWELL-LABEL: test_pminuw:
   4067 ; BROADWELL:       # %bb.0:
   4068 ; BROADWELL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4069 ; BROADWELL-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   4070 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4071 ;
   4072 ; SKYLAKE-LABEL: test_pminuw:
   4073 ; SKYLAKE:       # %bb.0:
   4074 ; SKYLAKE-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4075 ; SKYLAKE-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4076 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4077 ;
   4078 ; SKX-LABEL: test_pminuw:
   4079 ; SKX:       # %bb.0:
   4080 ; SKX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4081 ; SKX-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4082 ; SKX-NEXT:    retq # sched: [7:1.00]
   4083 ;
   4084 ; ZNVER1-LABEL: test_pminuw:
   4085 ; ZNVER1:       # %bb.0:
   4086 ; ZNVER1-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4087 ; ZNVER1-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   4088 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Applies the pminu.w intrinsic twice, first to the two vector arguments and
          ; then to that result and a vector loaded from %a2, so the checks above
          ; cover both the register form and the (%rdi) memory form of vpminuw.
   4089   %1 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
   4090   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   4091   %3 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %1, <16 x i16> %2)
   4092   ret <16 x i16> %3
   4093 }
   4094 declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
   4095 
   4096 define i32 @test_pmovmskb(<32 x i8> %a0) {
   4097 ; GENERIC-LABEL: test_pmovmskb:
   4098 ; GENERIC:       # %bb.0:
   4099 ; GENERIC-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
   4100 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
   4101 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4102 ;
   4103 ; HASWELL-LABEL: test_pmovmskb:
   4104 ; HASWELL:       # %bb.0:
   4105 ; HASWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
   4106 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
   4107 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4108 ;
   4109 ; BROADWELL-LABEL: test_pmovmskb:
   4110 ; BROADWELL:       # %bb.0:
   4111 ; BROADWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
   4112 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
   4113 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4114 ;
   4115 ; SKYLAKE-LABEL: test_pmovmskb:
   4116 ; SKYLAKE:       # %bb.0:
   4117 ; SKYLAKE-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
   4118 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
   4119 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4120 ;
   4121 ; SKX-LABEL: test_pmovmskb:
   4122 ; SKX:       # %bb.0:
   4123 ; SKX-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
   4124 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
   4125 ; SKX-NEXT:    retq # sched: [7:1.00]
   4126 ;
   4127 ; ZNVER1-LABEL: test_pmovmskb:
   4128 ; ZNVER1:       # %bb.0:
   4129 ; ZNVER1-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:2.00]
   4130 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
   4131 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Single call to the pmovmskb intrinsic on the <32 x i8> argument; its i32
          ; result is returned unchanged. Only the register form is exercised here,
          ; and the checks above expect a vzeroupper before retq on every listed CPU.
   4132   %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0)
   4133   ret i32 %1
   4134 }
   4135 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
   4135 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
   4136 
   4137 define <8 x i32> @test_pmovsxbd(<16 x i8> %a0, <16 x i8> *%a1) {
   4138 ; GENERIC-LABEL: test_pmovsxbd:
   4139 ; GENERIC:       # %bb.0:
   4140 ; GENERIC-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [1:1.00]
   4141 ; GENERIC-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
   4142 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4143 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4144 ;
   4145 ; HASWELL-LABEL: test_pmovsxbd:
   4146 ; HASWELL:       # %bb.0:
   4147 ; HASWELL-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
   4148 ; HASWELL-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
   4149 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4150 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4151 ;
   4152 ; BROADWELL-LABEL: test_pmovsxbd:
   4153 ; BROADWELL:       # %bb.0:
   4154 ; BROADWELL-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
   4155 ; BROADWELL-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
   4156 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4157 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4158 ;
   4159 ; SKYLAKE-LABEL: test_pmovsxbd:
   4160 ; SKYLAKE:       # %bb.0:
   4161 ; SKYLAKE-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
   4162 ; SKYLAKE-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
   4163 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4164 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4165 ;
   4166 ; SKX-LABEL: test_pmovsxbd:
   4167 ; SKX:       # %bb.0:
   4168 ; SKX-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
   4169 ; SKX-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
   4170 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4171 ; SKX-NEXT:    retq # sched: [7:1.00]
   4172 ;
   4173 ; ZNVER1-LABEL: test_pmovsxbd:
   4174 ; ZNVER1:       # %bb.0:
   4175 ; ZNVER1-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:0.50]
   4176 ; ZNVER1-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [1:0.50]
   4177 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4178 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends the low 8 bytes (picked out by shufflevector) of %a0 and of
          ; a vector loaded from %a1 to <8 x i32>, then adds the two results. The
          ; checks above expect vpmovsxbd in register and memory form plus a vpaddd;
          ; the znver1 section lists the memory form first.
   4179   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4180   %2 = sext <8 x i8> %1 to <8 x i32>
   4181   %3 = load <16 x i8>, <16 x i8> *%a1, align 16
   4182   %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4183   %5 = sext <8 x i8> %4 to <8 x i32>
   4184   %6 = add <8 x i32> %2, %5
   4185   ret <8 x i32> %6
   4186 }
   4187 
   4188 define <4 x i64> @test_pmovsxbq(<16 x i8> %a0, <16 x i8> *%a1) {
   4189 ; GENERIC-LABEL: test_pmovsxbq:
   4190 ; GENERIC:       # %bb.0:
   4191 ; GENERIC-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [1:1.00]
   4192 ; GENERIC-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
   4193 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4194 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4195 ;
   4196 ; HASWELL-LABEL: test_pmovsxbq:
   4197 ; HASWELL:       # %bb.0:
   4198 ; HASWELL-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
   4199 ; HASWELL-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
   4200 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4201 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4202 ;
   4203 ; BROADWELL-LABEL: test_pmovsxbq:
   4204 ; BROADWELL:       # %bb.0:
   4205 ; BROADWELL-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
   4206 ; BROADWELL-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
   4207 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4208 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4209 ;
   4210 ; SKYLAKE-LABEL: test_pmovsxbq:
   4211 ; SKYLAKE:       # %bb.0:
   4212 ; SKYLAKE-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
   4213 ; SKYLAKE-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
   4214 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4215 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4216 ;
   4217 ; SKX-LABEL: test_pmovsxbq:
   4218 ; SKX:       # %bb.0:
   4219 ; SKX-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
   4220 ; SKX-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
   4221 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4222 ; SKX-NEXT:    retq # sched: [7:1.00]
   4223 ;
   4224 ; ZNVER1-LABEL: test_pmovsxbq:
   4225 ; ZNVER1:       # %bb.0:
   4226 ; ZNVER1-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:0.50]
   4227 ; ZNVER1-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [1:0.50]
   4228 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4229 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends the low 4 bytes (picked out by shufflevector) of %a0 and of
          ; a vector loaded from %a1 to <4 x i64>, then adds the two results. The
          ; checks above expect vpmovsxbq in register and memory form plus a vpaddq;
          ; the znver1 section lists the memory form first.
   4230   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4231   %2 = sext <4 x i8> %1 to <4 x i64>
   4232   %3 = load <16 x i8>, <16 x i8> *%a1, align 16
   4233   %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4234   %5 = sext <4 x i8> %4 to <4 x i64>
   4235   %6 = add <4 x i64> %2, %5
   4236   ret <4 x i64> %6
   4237 }
   4238 
   4239 define <16 x i16> @test_pmovsxbw(<16 x i8> %a0, <16 x i8> *%a1) {
   4240 ; GENERIC-LABEL: test_pmovsxbw:
   4241 ; GENERIC:       # %bb.0:
   4242 ; GENERIC-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00]
   4243 ; GENERIC-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:1.00]
   4244 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4245 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4246 ;
   4247 ; HASWELL-LABEL: test_pmovsxbw:
   4248 ; HASWELL:       # %bb.0:
   4249 ; HASWELL-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
   4250 ; HASWELL-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
   4251 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4252 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4253 ;
   4254 ; BROADWELL-LABEL: test_pmovsxbw:
   4255 ; BROADWELL:       # %bb.0:
   4256 ; BROADWELL-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
   4257 ; BROADWELL-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:1.00]
   4258 ; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4259 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4260 ;
   4261 ; SKYLAKE-LABEL: test_pmovsxbw:
   4262 ; SKYLAKE:       # %bb.0:
   4263 ; SKYLAKE-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
   4264 ; SKYLAKE-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
   4265 ; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4266 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4267 ;
   4268 ; SKX-LABEL: test_pmovsxbw:
   4269 ; SKX:       # %bb.0:
   4270 ; SKX-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
   4271 ; SKX-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
   4272 ; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4273 ; SKX-NEXT:    retq # sched: [7:1.00]
   4274 ;
   4275 ; ZNVER1-LABEL: test_pmovsxbw:
   4276 ; ZNVER1:       # %bb.0:
   4277 ; ZNVER1-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:0.50]
   4278 ; ZNVER1-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [1:0.50]
   4279 ; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4280 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends all 16 bytes of %a0 and of a vector loaded from %a1 to
          ; <16 x i16> (no shufflevector is needed here), then adds the two results.
          ; The checks above expect vpmovsxbw in register and memory form plus a
          ; vpaddw; the znver1 section lists the memory form first.
   4281   %1 = sext <16 x i8> %a0 to <16 x i16>
   4282   %2 = load <16 x i8>, <16 x i8> *%a1, align 16
   4283   %3 = sext <16 x i8> %2 to <16 x i16>
   4284   %4 = add <16 x i16> %1, %3
   4285   ret <16 x i16> %4
   4286 }
   4287 
   4288 define <4 x i64> @test_pmovsxdq(<4 x i32> %a0, <4 x i32> *%a1) {
   4289 ; GENERIC-LABEL: test_pmovsxdq:
   4290 ; GENERIC:       # %bb.0:
   4291 ; GENERIC-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00]
   4292 ; GENERIC-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:1.00]
   4293 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4294 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4295 ;
   4296 ; HASWELL-LABEL: test_pmovsxdq:
   4297 ; HASWELL:       # %bb.0:
   4298 ; HASWELL-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
   4299 ; HASWELL-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
   4300 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4301 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4302 ;
   4303 ; BROADWELL-LABEL: test_pmovsxdq:
   4304 ; BROADWELL:       # %bb.0:
   4305 ; BROADWELL-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
   4306 ; BROADWELL-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:1.00]
   4307 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4308 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4309 ;
   4310 ; SKYLAKE-LABEL: test_pmovsxdq:
   4311 ; SKYLAKE:       # %bb.0:
   4312 ; SKYLAKE-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
   4313 ; SKYLAKE-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
   4314 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4315 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4316 ;
   4317 ; SKX-LABEL: test_pmovsxdq:
   4318 ; SKX:       # %bb.0:
   4319 ; SKX-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
   4320 ; SKX-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
   4321 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4322 ; SKX-NEXT:    retq # sched: [7:1.00]
   4323 ;
   4324 ; ZNVER1-LABEL: test_pmovsxdq:
   4325 ; ZNVER1:       # %bb.0:
   4326 ; ZNVER1-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:0.50]
   4327 ; ZNVER1-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [1:0.50]
   4328 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4329 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends all 4 dwords of %a0 and of a vector loaded from %a1 to
          ; <4 x i64>, then adds the two results. The checks above expect vpmovsxdq
          ; in register and memory form plus a vpaddq; the znver1 section lists the
          ; memory form first.
   4330   %1 = sext <4 x i32> %a0 to <4 x i64>
   4331   %2 = load <4 x i32>, <4 x i32> *%a1, align 16
   4332   %3 = sext <4 x i32> %2 to <4 x i64>
   4333   %4 = add <4 x i64> %1, %3
   4334   ret <4 x i64> %4
   4335 }
   4336 
   4337 define <8 x i32> @test_pmovsxwd(<8 x i16> %a0, <8 x i16> *%a1) {
   4338 ; GENERIC-LABEL: test_pmovsxwd:
   4339 ; GENERIC:       # %bb.0:
   4340 ; GENERIC-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00]
   4341 ; GENERIC-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:1.00]
   4342 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4343 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4344 ;
   4345 ; HASWELL-LABEL: test_pmovsxwd:
   4346 ; HASWELL:       # %bb.0:
   4347 ; HASWELL-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
   4348 ; HASWELL-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
   4349 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4350 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4351 ;
   4352 ; BROADWELL-LABEL: test_pmovsxwd:
   4353 ; BROADWELL:       # %bb.0:
   4354 ; BROADWELL-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
   4355 ; BROADWELL-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:1.00]
   4356 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4357 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4358 ;
   4359 ; SKYLAKE-LABEL: test_pmovsxwd:
   4360 ; SKYLAKE:       # %bb.0:
   4361 ; SKYLAKE-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
   4362 ; SKYLAKE-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
   4363 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4364 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4365 ;
   4366 ; SKX-LABEL: test_pmovsxwd:
   4367 ; SKX:       # %bb.0:
   4368 ; SKX-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
   4369 ; SKX-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
   4370 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4371 ; SKX-NEXT:    retq # sched: [7:1.00]
   4372 ;
   4373 ; ZNVER1-LABEL: test_pmovsxwd:
   4374 ; ZNVER1:       # %bb.0:
   4375 ; ZNVER1-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:0.50]
   4376 ; ZNVER1-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [1:0.50]
   4377 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4378 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends all 8 words of %a0 and of a vector loaded from %a1 to
          ; <8 x i32>, then adds the two results. The checks above expect vpmovsxwd
          ; in register and memory form plus a vpaddd; the znver1 section lists the
          ; memory form first.
   4379   %1 = sext <8 x i16> %a0 to <8 x i32>
   4380   %2 = load <8 x i16>, <8 x i16> *%a1, align 16
   4381   %3 = sext <8 x i16> %2 to <8 x i32>
   4382   %4 = add <8 x i32> %1, %3
   4383   ret <8 x i32> %4
   4384 }
   4385 
   4386 define <4 x i64> @test_pmovsxwq(<8 x i16> %a0, <8 x i16> *%a1) {
   4387 ; GENERIC-LABEL: test_pmovsxwq:
   4388 ; GENERIC:       # %bb.0:
   4389 ; GENERIC-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [1:1.00]
   4390 ; GENERIC-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
   4391 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4392 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4393 ;
   4394 ; HASWELL-LABEL: test_pmovsxwq:
   4395 ; HASWELL:       # %bb.0:
   4396 ; HASWELL-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
   4397 ; HASWELL-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
   4398 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4399 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4400 ;
   4401 ; BROADWELL-LABEL: test_pmovsxwq:
   4402 ; BROADWELL:       # %bb.0:
   4403 ; BROADWELL-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
   4404 ; BROADWELL-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
   4405 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4406 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4407 ;
   4408 ; SKYLAKE-LABEL: test_pmovsxwq:
   4409 ; SKYLAKE:       # %bb.0:
   4410 ; SKYLAKE-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
   4411 ; SKYLAKE-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
   4412 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4413 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4414 ;
   4415 ; SKX-LABEL: test_pmovsxwq:
   4416 ; SKX:       # %bb.0:
   4417 ; SKX-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
   4418 ; SKX-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
   4419 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4420 ; SKX-NEXT:    retq # sched: [7:1.00]
   4421 ;
   4422 ; ZNVER1-LABEL: test_pmovsxwq:
   4423 ; ZNVER1:       # %bb.0:
   4424 ; ZNVER1-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:0.50]
   4425 ; ZNVER1-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [1:0.50]
   4426 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4427 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Sign-extends the low 4 words (picked out by shufflevector) of %a0 and of
          ; a vector loaded from %a1 to <4 x i64>, then adds the two results. The
          ; checks above expect vpmovsxwq in register and memory form plus a vpaddq;
          ; the znver1 section lists the memory form first.
   4428   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4429   %2 = sext <4 x i16> %1 to <4 x i64>
   4430   %3 = load <8 x i16>, <8 x i16> *%a1, align 16
   4431   %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4432   %5 = sext <4 x i16> %4 to <4 x i64>
   4433   %6 = add <4 x i64> %2, %5
   4434   ret <4 x i64> %6
   4435 }
   4436 
   4437 define <8 x i32> @test_pmovzxbd(<16 x i8> %a0, <16 x i8> *%a1) {
        ; Scheduling test for vpmovzxbd with a ymm destination. The low 8 bytes of
        ; %a0 and of a 16-byte load from %a1 are each zero-extended to <8 x i32>,
        ; then the two results are added. The CPU-specific check blocks below expect
        ; one register-form and one memory-form vpmovzxbd followed by vpaddd; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4438 ; GENERIC-LABEL: test_pmovzxbd:
   4439 ; GENERIC:       # %bb.0:
   4440 ; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
   4441 ; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00]
   4442 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4443 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4444 ;
   4445 ; HASWELL-LABEL: test_pmovzxbd:
   4446 ; HASWELL:       # %bb.0:
   4447 ; HASWELL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
   4448 ; HASWELL-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
   4449 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4450 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4451 ;
   4452 ; BROADWELL-LABEL: test_pmovzxbd:
   4453 ; BROADWELL:       # %bb.0:
   4454 ; BROADWELL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
   4455 ; BROADWELL-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [9:1.00]
   4456 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4457 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4458 ;
   4459 ; SKYLAKE-LABEL: test_pmovzxbd:
   4460 ; SKYLAKE:       # %bb.0:
   4461 ; SKYLAKE-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
   4462 ; SKYLAKE-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
   4463 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4464 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4465 ;
   4466 ; SKX-LABEL: test_pmovzxbd:
   4467 ; SKX:       # %bb.0:
   4468 ; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
   4469 ; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
   4470 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4471 ; SKX-NEXT:    retq # sched: [7:1.00]
   4472 ;
   4473 ; ZNVER1-LABEL: test_pmovzxbd:
   4474 ; ZNVER1:       # %bb.0:
   4475 ; ZNVER1-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:0.50]
   4476 ; ZNVER1-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:0.50]
   4477 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4478 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: keep elements 0-7 of %a0, then zero-extend to i32.
   4479   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4480   %2 = zext <8 x i8> %1 to <8 x i32>
          ; Memory operand: same extract-and-extend pattern on the loaded vector.
   4481   %3 = load <16 x i8>, <16 x i8> *%a1, align 16
   4482   %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4483   %5 = zext <8 x i8> %4 to <8 x i32>
          ; Add the two extended values so both feed the returned result.
   4484   %6 = add <8 x i32> %2, %5
   4485   ret <8 x i32> %6
   4486 }
   4487 
   4488 define <4 x i64> @test_pmovzxbq(<16 x i8> %a0, <16 x i8> *%a1) {
        ; Scheduling test for vpmovzxbq with a ymm destination. The low 4 bytes of
        ; %a0 and of a 16-byte load from %a1 are each zero-extended to <4 x i64>,
        ; then the two results are added. The CPU-specific check blocks below expect
        ; one register-form and one memory-form vpmovzxbq followed by vpaddq; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4489 ; GENERIC-LABEL: test_pmovzxbq:
   4490 ; GENERIC:       # %bb.0:
   4491 ; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
   4492 ; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00]
   4493 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4494 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4495 ;
   4496 ; HASWELL-LABEL: test_pmovzxbq:
   4497 ; HASWELL:       # %bb.0:
   4498 ; HASWELL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
   4499 ; HASWELL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
   4500 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4501 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4502 ;
   4503 ; BROADWELL-LABEL: test_pmovzxbq:
   4504 ; BROADWELL:       # %bb.0:
   4505 ; BROADWELL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
   4506 ; BROADWELL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00]
   4507 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4508 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4509 ;
   4510 ; SKYLAKE-LABEL: test_pmovzxbq:
   4511 ; SKYLAKE:       # %bb.0:
   4512 ; SKYLAKE-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
   4513 ; SKYLAKE-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
   4514 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4515 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4516 ;
   4517 ; SKX-LABEL: test_pmovzxbq:
   4518 ; SKX:       # %bb.0:
   4519 ; SKX-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
   4520 ; SKX-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
   4521 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4522 ; SKX-NEXT:    retq # sched: [7:1.00]
   4523 ;
   4524 ; ZNVER1-LABEL: test_pmovzxbq:
   4525 ; ZNVER1:       # %bb.0:
   4526 ; ZNVER1-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
   4527 ; ZNVER1-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
   4528 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4529 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: keep elements 0-3 of %a0, then zero-extend to i64.
   4530   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4531   %2 = zext <4 x i8> %1 to <4 x i64>
          ; Memory operand: same extract-and-extend pattern on the loaded vector.
   4532   %3 = load <16 x i8>, <16 x i8> *%a1, align 16
   4533   %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4534   %5 = zext <4 x i8> %4 to <4 x i64>
          ; Add the two extended values so both feed the returned result.
   4535   %6 = add <4 x i64> %2, %5
   4536   ret <4 x i64> %6
   4537 }
   4538 
   4539 define <16 x i16> @test_pmovzxbw(<16 x i8> %a0, <16 x i8> *%a1) {
        ; Scheduling test for vpmovzxbw with a ymm destination. All 16 bytes of %a0
        ; and of a 16-byte load from %a1 are zero-extended to <16 x i16>, then the
        ; two results are added. The CPU-specific check blocks below expect one
        ; register-form and one memory-form vpmovzxbw followed by vpaddw; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4540 ; GENERIC-LABEL: test_pmovzxbw:
   4541 ; GENERIC:       # %bb.0:
   4542 ; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
   4543 ; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00]
   4544 ; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4545 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4546 ;
   4547 ; HASWELL-LABEL: test_pmovzxbw:
   4548 ; HASWELL:       # %bb.0:
   4549 ; HASWELL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
   4550 ; HASWELL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
   4551 ; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4552 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4553 ;
   4554 ; BROADWELL-LABEL: test_pmovzxbw:
   4555 ; BROADWELL:       # %bb.0:
   4556 ; BROADWELL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
   4557 ; BROADWELL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [9:1.00]
   4558 ; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4559 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4560 ;
   4561 ; SKYLAKE-LABEL: test_pmovzxbw:
   4562 ; SKYLAKE:       # %bb.0:
   4563 ; SKYLAKE-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
   4564 ; SKYLAKE-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
   4565 ; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4566 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4567 ;
   4568 ; SKX-LABEL: test_pmovzxbw:
   4569 ; SKX:       # %bb.0:
   4570 ; SKX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
   4571 ; SKX-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
   4572 ; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4573 ; SKX-NEXT:    retq # sched: [7:1.00]
   4574 ;
   4575 ; ZNVER1-LABEL: test_pmovzxbw:
   4576 ; ZNVER1:       # %bb.0:
   4577 ; ZNVER1-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:0.50]
   4578 ; ZNVER1-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:0.50]
   4579 ; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4580 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: zero-extend the whole vector (no shuffle needed).
   4581   %1 = zext <16 x i8> %a0 to <16 x i16>
          ; Memory operand: load, then zero-extend the same way.
   4582   %2 = load <16 x i8>, <16 x i8> *%a1, align 16
   4583   %3 = zext <16 x i8> %2 to <16 x i16>
          ; Add the two extended values so both feed the returned result.
   4584   %4 = add <16 x i16> %1, %3
   4585   ret <16 x i16> %4
   4586 }
   4587 
   4588 define <4 x i64> @test_pmovzxdq(<4 x i32> %a0, <4 x i32> *%a1) {
        ; Scheduling test for vpmovzxdq with a ymm destination. All 4 dwords of %a0
        ; and of a 16-byte load from %a1 are zero-extended to <4 x i64>, then the
        ; two results are added. The CPU-specific check blocks below expect one
        ; register-form and one memory-form vpmovzxdq followed by vpaddq; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4589 ; GENERIC-LABEL: test_pmovzxdq:
   4590 ; GENERIC:       # %bb.0:
   4591 ; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
   4592 ; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00]
   4593 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4594 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4595 ;
   4596 ; HASWELL-LABEL: test_pmovzxdq:
   4597 ; HASWELL:       # %bb.0:
   4598 ; HASWELL-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
   4599 ; HASWELL-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
   4600 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4601 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4602 ;
   4603 ; BROADWELL-LABEL: test_pmovzxdq:
   4604 ; BROADWELL:       # %bb.0:
   4605 ; BROADWELL-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
   4606 ; BROADWELL-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00]
   4607 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4608 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4609 ;
   4610 ; SKYLAKE-LABEL: test_pmovzxdq:
   4611 ; SKYLAKE:       # %bb.0:
   4612 ; SKYLAKE-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
   4613 ; SKYLAKE-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
   4614 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4615 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4616 ;
   4617 ; SKX-LABEL: test_pmovzxdq:
   4618 ; SKX:       # %bb.0:
   4619 ; SKX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
   4620 ; SKX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
   4621 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4622 ; SKX-NEXT:    retq # sched: [7:1.00]
   4623 ;
   4624 ; ZNVER1-LABEL: test_pmovzxdq:
   4625 ; ZNVER1:       # %bb.0:
   4626 ; ZNVER1-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
   4627 ; ZNVER1-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
   4628 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4629 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: zero-extend the whole vector (no shuffle needed).
   4630   %1 = zext <4 x i32> %a0 to <4 x i64>
          ; Memory operand: load, then zero-extend the same way.
   4631   %2 = load <4 x i32>, <4 x i32> *%a1, align 16
   4632   %3 = zext <4 x i32> %2 to <4 x i64>
          ; Add the two extended values so both feed the returned result.
   4633   %4 = add <4 x i64> %1, %3
   4634   ret <4 x i64> %4
   4635 }
   4636 
   4637 define <8 x i32> @test_pmovzxwd(<8 x i16> %a0, <8 x i16> *%a1) {
        ; Scheduling test for vpmovzxwd with a ymm destination. All 8 words of %a0
        ; and of a 16-byte load from %a1 are zero-extended to <8 x i32>, then the
        ; two results are added. The CPU-specific check blocks below expect one
        ; register-form and one memory-form vpmovzxwd followed by vpaddd; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4638 ; GENERIC-LABEL: test_pmovzxwd:
   4639 ; GENERIC:       # %bb.0:
   4640 ; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
   4641 ; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
   4642 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4643 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4644 ;
   4645 ; HASWELL-LABEL: test_pmovzxwd:
   4646 ; HASWELL:       # %bb.0:
   4647 ; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
   4648 ; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
   4649 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4650 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4651 ;
   4652 ; BROADWELL-LABEL: test_pmovzxwd:
   4653 ; BROADWELL:       # %bb.0:
   4654 ; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
   4655 ; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
   4656 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4657 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4658 ;
   4659 ; SKYLAKE-LABEL: test_pmovzxwd:
   4660 ; SKYLAKE:       # %bb.0:
   4661 ; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
   4662 ; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
   4663 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4664 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4665 ;
   4666 ; SKX-LABEL: test_pmovzxwd:
   4667 ; SKX:       # %bb.0:
   4668 ; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
   4669 ; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
   4670 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4671 ; SKX-NEXT:    retq # sched: [7:1.00]
   4672 ;
   4673 ; ZNVER1-LABEL: test_pmovzxwd:
   4674 ; ZNVER1:       # %bb.0:
   4675 ; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
   4676 ; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
   4677 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4678 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: zero-extend the whole vector (no shuffle needed).
   4679   %1 = zext <8 x i16> %a0 to <8 x i32>
          ; Memory operand: load, then zero-extend the same way.
   4680   %2 = load <8 x i16>, <8 x i16> *%a1, align 16
   4681   %3 = zext <8 x i16> %2 to <8 x i32>
          ; Add the two extended values so both feed the returned result.
   4682   %4 = add <8 x i32> %1, %3
   4683   ret <8 x i32> %4
   4684 }
   4685 
   4686 define <4 x i64> @test_pmovzxwq(<8 x i16> %a0, <8 x i16> *%a1) {
        ; Scheduling test for vpmovzxwq with a ymm destination. The low 4 words of
        ; %a0 and of a 16-byte load from %a1 are each zero-extended to <4 x i64>,
        ; then the two results are added. The CPU-specific check blocks below expect
        ; one register-form and one memory-form vpmovzxwq followed by vpaddq; the
        ; ZNVER1 block expects the memory form first.
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4687 ; GENERIC-LABEL: test_pmovzxwq:
   4688 ; GENERIC:       # %bb.0:
   4689 ; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
   4690 ; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00]
   4691 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4692 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4693 ;
   4694 ; HASWELL-LABEL: test_pmovzxwq:
   4695 ; HASWELL:       # %bb.0:
   4696 ; HASWELL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
   4697 ; HASWELL-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
   4698 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4699 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4700 ;
   4701 ; BROADWELL-LABEL: test_pmovzxwq:
   4702 ; BROADWELL:       # %bb.0:
   4703 ; BROADWELL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
   4704 ; BROADWELL-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00]
   4705 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   4706 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4707 ;
   4708 ; SKYLAKE-LABEL: test_pmovzxwq:
   4709 ; SKYLAKE:       # %bb.0:
   4710 ; SKYLAKE-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
   4711 ; SKYLAKE-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
   4712 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4713 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4714 ;
   4715 ; SKX-LABEL: test_pmovzxwq:
   4716 ; SKX:       # %bb.0:
   4717 ; SKX-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
   4718 ; SKX-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
   4719 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   4720 ; SKX-NEXT:    retq # sched: [7:1.00]
   4721 ;
   4722 ; ZNVER1-LABEL: test_pmovzxwq:
   4723 ; ZNVER1:       # %bb.0:
   4724 ; ZNVER1-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
   4725 ; ZNVER1-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
   4726 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   4727 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register operand: keep elements 0-3 of %a0, then zero-extend to i64.
   4728   %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4729   %2 = zext <4 x i16> %1 to <4 x i64>
          ; Memory operand: same extract-and-extend pattern on the loaded vector.
   4730   %3 = load <8 x i16>, <8 x i16> *%a1, align 16
   4731   %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   4732   %5 = zext <4 x i16> %4 to <4 x i64>
          ; Add the two extended values so both feed the returned result.
   4733   %6 = add <4 x i64> %2, %5
   4734   ret <4 x i64> %6
   4735 }
   4736 
   4737 define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling test for the 256-bit vpmuldq. The llvm.x86.avx2.pmul.dq
        ; intrinsic is called twice in a chain: first on %a0 and %a1, then on that
        ; result and a vector loaded from %a2. The CPU-specific check blocks below
        ; expect a register-form vpmuldq followed by a memory-form vpmuldq (%rdi).
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4738 ; GENERIC-LABEL: test_pmuldq:
   4739 ; GENERIC:       # %bb.0:
   4740 ; GENERIC-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4741 ; GENERIC-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4742 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4743 ;
   4744 ; HASWELL-LABEL: test_pmuldq:
   4745 ; HASWELL:       # %bb.0:
   4746 ; HASWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4747 ; HASWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4748 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4749 ;
   4750 ; BROADWELL-LABEL: test_pmuldq:
   4751 ; BROADWELL:       # %bb.0:
   4752 ; BROADWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4753 ; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4754 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4755 ;
   4756 ; SKYLAKE-LABEL: test_pmuldq:
   4757 ; SKYLAKE:       # %bb.0:
   4758 ; SKYLAKE-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4759 ; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4760 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4761 ;
   4762 ; SKX-LABEL: test_pmuldq:
   4763 ; SKX:       # %bb.0:
   4764 ; SKX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4765 ; SKX-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4766 ; SKX-NEXT:    retq # sched: [7:1.00]
   4767 ;
   4768 ; ZNVER1-LABEL: test_pmuldq:
   4769 ; ZNVER1:       # %bb.0:
   4770 ; ZNVER1-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   4771 ; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4772 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; First multiply uses the two register arguments.
   4773   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
          ; The intrinsic returns <4 x i64> but takes <8 x i32>, so reinterpret the
          ; result before feeding it to the second call.
   4774   %2 = bitcast <4 x i64> %1 to <8 x i32>
          ; Second multiply takes its other operand from memory.
   4775   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
   4776   %4 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %2, <8 x i32> %3)
   4777   ret <4 x i64> %4
   4778 }
   4779 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
   4780 
   4781 define <16 x i16> @test_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling test for the 256-bit vpmulhrsw. The llvm.x86.avx2.pmul.hr.sw
        ; intrinsic is called twice in a chain: first on %a0 and %a1, then on that
        ; result and a vector loaded from %a2. The CPU-specific check blocks below
        ; expect a register-form vpmulhrsw followed by a memory-form vpmulhrsw (%rdi).
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4782 ; GENERIC-LABEL: test_pmulhrsw:
   4783 ; GENERIC:       # %bb.0:
   4784 ; GENERIC-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4785 ; GENERIC-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4786 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4787 ;
   4788 ; HASWELL-LABEL: test_pmulhrsw:
   4789 ; HASWELL:       # %bb.0:
   4790 ; HASWELL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4791 ; HASWELL-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4792 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4793 ;
   4794 ; BROADWELL-LABEL: test_pmulhrsw:
   4795 ; BROADWELL:       # %bb.0:
   4796 ; BROADWELL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4797 ; BROADWELL-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4798 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4799 ;
   4800 ; SKYLAKE-LABEL: test_pmulhrsw:
   4801 ; SKYLAKE:       # %bb.0:
   4802 ; SKYLAKE-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4803 ; SKYLAKE-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4804 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4805 ;
   4806 ; SKX-LABEL: test_pmulhrsw:
   4807 ; SKX:       # %bb.0:
   4808 ; SKX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4809 ; SKX-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4810 ; SKX-NEXT:    retq # sched: [7:1.00]
   4811 ;
   4812 ; ZNVER1-LABEL: test_pmulhrsw:
   4813 ; ZNVER1:       # %bb.0:
   4814 ; ZNVER1-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   4815 ; ZNVER1-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4816 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; First multiply uses the two register arguments.
   4817   %1 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
          ; Second multiply takes its other operand from memory.
   4818   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   4819   %3 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %1, <16 x i16> %2)
   4820   ret <16 x i16> %3
   4821 }
   4822 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
   4823 
   4824 define <16 x i16> @test_pmulhuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling test for the 256-bit vpmulhuw. The llvm.x86.avx2.pmulhu.w
        ; intrinsic is called twice in a chain: first on %a0 and %a1, then on that
        ; result and a vector loaded from %a2. The CPU-specific check blocks below
        ; expect a register-form vpmulhuw followed by a memory-form vpmulhuw (%rdi).
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4825 ; GENERIC-LABEL: test_pmulhuw:
   4826 ; GENERIC:       # %bb.0:
   4827 ; GENERIC-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4828 ; GENERIC-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4829 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4830 ;
   4831 ; HASWELL-LABEL: test_pmulhuw:
   4832 ; HASWELL:       # %bb.0:
   4833 ; HASWELL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4834 ; HASWELL-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4835 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4836 ;
   4837 ; BROADWELL-LABEL: test_pmulhuw:
   4838 ; BROADWELL:       # %bb.0:
   4839 ; BROADWELL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4840 ; BROADWELL-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4841 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4842 ;
   4843 ; SKYLAKE-LABEL: test_pmulhuw:
   4844 ; SKYLAKE:       # %bb.0:
   4845 ; SKYLAKE-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4846 ; SKYLAKE-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4847 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4848 ;
   4849 ; SKX-LABEL: test_pmulhuw:
   4850 ; SKX:       # %bb.0:
   4851 ; SKX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4852 ; SKX-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4853 ; SKX-NEXT:    retq # sched: [7:1.00]
   4854 ;
   4855 ; ZNVER1-LABEL: test_pmulhuw:
   4856 ; ZNVER1:       # %bb.0:
   4857 ; ZNVER1-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   4858 ; ZNVER1-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4859 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; First multiply uses the two register arguments.
   4860   %1 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
          ; Second multiply takes its other operand from memory.
   4861   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   4862   %3 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %1, <16 x i16> %2)
   4863   ret <16 x i16> %3
   4864 }
   4865 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
   4866 
   4867 define <16 x i16> @test_pmulhw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling test for the 256-bit vpmulhw. The llvm.x86.avx2.pmulh.w
        ; intrinsic is called twice in a chain: first on %a0 and %a1, then on that
        ; result and a vector loaded from %a2. The CPU-specific check blocks below
        ; expect a register-form vpmulhw followed by a memory-form vpmulhw (%rdi).
        ; NOTE(review): the bracketed "sched" pair comes from llc -print-schedule and
        ; is presumably latency and reciprocal throughput - confirm.
   4868 ; GENERIC-LABEL: test_pmulhw:
   4869 ; GENERIC:       # %bb.0:
   4870 ; GENERIC-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4871 ; GENERIC-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4872 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4873 ;
   4874 ; HASWELL-LABEL: test_pmulhw:
   4875 ; HASWELL:       # %bb.0:
   4876 ; HASWELL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4877 ; HASWELL-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4878 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4879 ;
   4880 ; BROADWELL-LABEL: test_pmulhw:
   4881 ; BROADWELL:       # %bb.0:
   4882 ; BROADWELL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4883 ; BROADWELL-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4884 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4885 ;
   4886 ; SKYLAKE-LABEL: test_pmulhw:
   4887 ; SKYLAKE:       # %bb.0:
   4888 ; SKYLAKE-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4889 ; SKYLAKE-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4890 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4891 ;
   4892 ; SKX-LABEL: test_pmulhw:
   4893 ; SKX:       # %bb.0:
   4894 ; SKX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4895 ; SKX-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4896 ; SKX-NEXT:    retq # sched: [7:1.00]
   4897 ;
   4898 ; ZNVER1-LABEL: test_pmulhw:
   4899 ; ZNVER1:       # %bb.0:
   4900 ; ZNVER1-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   4901 ; ZNVER1-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4902 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; First multiply uses the two register arguments.
   4903   %1 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
          ; Second multiply takes its other operand from memory.
   4904   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   4905   %3 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %1, <16 x i16> %2)
   4906   ret <16 x i16> %3
   4907 }
   4908 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
   4909 
   4910 define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling test for the 256-bit vpmulld instruction.
        ; No intrinsic is used: two plain IR "mul <8 x i32>" operations are
        ; expected to be selected as vpmulld, first with two register operands
        ; and then with an operand loaded from %a2.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   4911 ; GENERIC-LABEL: test_pmulld:
   4912 ; GENERIC:       # %bb.0:
   4913 ; GENERIC-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4914 ; GENERIC-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4915 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4916 ;
   4917 ; HASWELL-LABEL: test_pmulld:
   4918 ; HASWELL:       # %bb.0:
   4919 ; HASWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
   4920 ; HASWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:2.00]
   4921 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4922 ;
   4923 ; BROADWELL-LABEL: test_pmulld:
   4924 ; BROADWELL:       # %bb.0:
   4925 ; BROADWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
   4926 ; BROADWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
   4927 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4928 ;
   4929 ; SKYLAKE-LABEL: test_pmulld:
   4930 ; SKYLAKE:       # %bb.0:
   4931 ; SKYLAKE-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
   4932 ; SKYLAKE-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:1.00]
   4933 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4934 ;
   4935 ; SKX-LABEL: test_pmulld:
   4936 ; SKX:       # %bb.0:
   4937 ; SKX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
   4938 ; SKX-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:1.00]
   4939 ; SKX-NEXT:    retq # sched: [7:1.00]
   4940 ;
   4941 ; ZNVER1-LABEL: test_pmulld:
   4942 ; ZNVER1:       # %bb.0:
   4943 ; ZNVER1-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
   4944 ; ZNVER1-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
   4945 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register multiply of the two vector arguments.
   4946   %1 = mul <8 x i32> %a0, %a1
          ; Second multiply takes its operand from a 32-byte-aligned load, which
          ; the checks above expect to appear as the (%rdi) operand.
   4947   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   4948   %3 = mul <8 x i32> %1, %2
   4949   ret <8 x i32> %3
   4950 }
   4951 
   4952 define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling test for the 256-bit vpmullw instruction.
        ; No intrinsic is used: two plain IR "mul <16 x i16>" operations are
        ; expected to be selected as vpmullw, first with two register operands
        ; and then with an operand loaded from %a2.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   4953 ; GENERIC-LABEL: test_pmullw:
   4954 ; GENERIC:       # %bb.0:
   4955 ; GENERIC-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4956 ; GENERIC-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4957 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   4958 ;
   4959 ; HASWELL-LABEL: test_pmullw:
   4960 ; HASWELL:       # %bb.0:
   4961 ; HASWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4962 ; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4963 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   4964 ;
   4965 ; BROADWELL-LABEL: test_pmullw:
   4966 ; BROADWELL:       # %bb.0:
   4967 ; BROADWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4968 ; BROADWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4969 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   4970 ;
   4971 ; SKYLAKE-LABEL: test_pmullw:
   4972 ; SKYLAKE:       # %bb.0:
   4973 ; SKYLAKE-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4974 ; SKYLAKE-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4975 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   4976 ;
   4977 ; SKX-LABEL: test_pmullw:
   4978 ; SKX:       # %bb.0:
   4979 ; SKX-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   4980 ; SKX-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   4981 ; SKX-NEXT:    retq # sched: [7:1.00]
   4982 ;
   4983 ; ZNVER1-LABEL: test_pmullw:
   4984 ; ZNVER1:       # %bb.0:
   4985 ; ZNVER1-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   4986 ; ZNVER1-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   4987 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register multiply of the two vector arguments.
   4988   %1 = mul <16 x i16> %a0, %a1
          ; Second multiply takes its operand from a 32-byte-aligned load, which
          ; the checks above expect to appear as the (%rdi) operand.
   4989   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   4990   %3 = mul <16 x i16> %1, %2
   4991   ret <16 x i16> %3
   4992 }
   4993 
   4994 define <4 x i64> @test_pmuludq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling test for the 256-bit vpmuludq instruction.
        ; The body calls llvm.x86.avx2.pmulu.dq twice: once on the two vector
        ; arguments and once on the (bitcast) first result and a vector loaded
        ; from %a2, so the expected assembly holds a register-register and a
        ; memory-operand vpmuludq.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   4995 ; GENERIC-LABEL: test_pmuludq:
   4996 ; GENERIC:       # %bb.0:
   4997 ; GENERIC-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   4998 ; GENERIC-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   4999 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5000 ;
   5001 ; HASWELL-LABEL: test_pmuludq:
   5002 ; HASWELL:       # %bb.0:
   5003 ; HASWELL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   5004 ; HASWELL-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   5005 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5006 ;
   5007 ; BROADWELL-LABEL: test_pmuludq:
   5008 ; BROADWELL:       # %bb.0:
   5009 ; BROADWELL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   5010 ; BROADWELL-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5011 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5012 ;
   5013 ; SKYLAKE-LABEL: test_pmuludq:
   5014 ; SKYLAKE:       # %bb.0:
   5015 ; SKYLAKE-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   5016 ; SKYLAKE-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   5017 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5018 ;
   5019 ; SKX-LABEL: test_pmuludq:
   5020 ; SKX:       # %bb.0:
   5021 ; SKX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
   5022 ; SKX-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
   5023 ; SKX-NEXT:    retq # sched: [7:1.00]
   5024 ;
   5025 ; ZNVER1-LABEL: test_pmuludq:
   5026 ; ZNVER1:       # %bb.0:
   5027 ; ZNVER1-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
   5028 ; ZNVER1-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5029 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5030   %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
          ; The intrinsic returns <4 x i64> but takes <8 x i32>, so the first
          ; result is bitcast before being fed into the second call.
   5031   %2 = bitcast <4 x i64> %1 to <8 x i32>
          ; Memory form: the second operand comes from a 32-byte-aligned load.
   5032   %3 = load <8 x i32>, <8 x i32> *%a2, align 32
   5033   %4 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %2, <8 x i32> %3)
   5034   ret <4 x i64> %4
   5035 }
        ; Intrinsic used by test_pmuludq above.
   5036 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
   5037 
   5038 define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Scheduling test for the 256-bit vpor instruction.
        ; Two IR "or <4 x i64>" operations (register-register, then with a value
        ; loaded from %a2) are followed by an integer add; the expected assembly
        ; is vpor, vpor with a memory operand, then vpaddq.
        ; (review): the trailing add is presumably there to keep the ORs in the
        ; integer domain so that vpor is emitted rather than a floating-point
        ; domain OR -- TODO confirm.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5039 ; GENERIC-LABEL: test_por:
   5040 ; GENERIC:       # %bb.0:
   5041 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5042 ; GENERIC-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5043 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5044 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5045 ;
   5046 ; HASWELL-LABEL: test_por:
   5047 ; HASWELL:       # %bb.0:
   5048 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5049 ; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5050 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5051 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5052 ;
   5053 ; BROADWELL-LABEL: test_por:
   5054 ; BROADWELL:       # %bb.0:
   5055 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5056 ; BROADWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   5057 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5058 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5059 ;
   5060 ; SKYLAKE-LABEL: test_por:
   5061 ; SKYLAKE:       # %bb.0:
   5062 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5063 ; SKYLAKE-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5064 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5065 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5066 ;
   5067 ; SKX-LABEL: test_por:
   5068 ; SKX:       # %bb.0:
   5069 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5070 ; SKX-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5071 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5072 ; SKX-NEXT:    retq # sched: [7:1.00]
   5073 ;
   5074 ; ZNVER1-LABEL: test_por:
   5075 ; ZNVER1:       # %bb.0:
   5076 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5077 ; ZNVER1-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5078 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5079 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register OR of the two vector arguments.
   5080   %1 = or <4 x i64> %a0, %a1
          ; Second OR takes its operand from a 32-byte-aligned load.
   5081   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   5082   %3 = or <4 x i64> %1, %2
          ; Integer add on the OR result; matched by the vpaddq check lines.
   5083   %4 = add <4 x i64> %3, %a1
   5084   ret <4 x i64> %4
   5085 }
   5086 
   5087 define <4 x i64> @test_psadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Scheduling test for the 256-bit vpsadbw instruction.
        ; The body calls llvm.x86.avx2.psad.bw twice: once on the two vector
        ; arguments and once on the (bitcast) first result and a vector loaded
        ; from %a2, so the expected assembly holds a register-register and a
        ; memory-operand vpsadbw.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5088 ; GENERIC-LABEL: test_psadbw:
   5089 ; GENERIC:       # %bb.0:
   5090 ; GENERIC-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   5091 ; GENERIC-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   5092 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5093 ;
   5094 ; HASWELL-LABEL: test_psadbw:
   5095 ; HASWELL:       # %bb.0:
   5096 ; HASWELL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   5097 ; HASWELL-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
   5098 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5099 ;
   5100 ; BROADWELL-LABEL: test_psadbw:
   5101 ; BROADWELL:       # %bb.0:
   5102 ; BROADWELL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   5103 ; BROADWELL-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5104 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5105 ;
   5106 ; SKYLAKE-LABEL: test_psadbw:
   5107 ; SKYLAKE:       # %bb.0:
   5108 ; SKYLAKE-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   5109 ; SKYLAKE-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   5110 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5111 ;
   5112 ; SKX-LABEL: test_psadbw:
   5113 ; SKX:       # %bb.0:
   5114 ; SKX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   5115 ; SKX-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   5116 ; SKX-NEXT:    retq # sched: [7:1.00]
   5117 ;
   5118 ; ZNVER1-LABEL: test_psadbw:
   5119 ; ZNVER1:       # %bb.0:
   5120 ; ZNVER1-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
   5121 ; ZNVER1-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
   5122 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5123   %1 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
          ; The intrinsic returns <4 x i64> but takes <32 x i8>, so the first
          ; result is bitcast before being fed into the second call.
   5124   %2 = bitcast <4 x i64> %1 to <32 x i8>
          ; Memory form: the second operand comes from a 32-byte-aligned load.
   5125   %3 = load <32 x i8>, <32 x i8> *%a2, align 32
   5126   %4 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %2, <32 x i8> %3)
   5127   ret <4 x i64> %4
   5128 }
        ; Intrinsic used by test_psadbw above.
   5129 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
   5130 
   5131 define <32 x i8> @test_pshufb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Scheduling test for the 256-bit vpshufb instruction.
        ; The body calls llvm.x86.avx2.pshuf.b twice: once on the two vector
        ; arguments and once on that result and a vector loaded from %a2, so the
        ; expected assembly holds a register-register and a memory-operand vpshufb.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5132 ; GENERIC-LABEL: test_pshufb:
   5133 ; GENERIC:       # %bb.0:
   5134 ; GENERIC-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5135 ; GENERIC-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5136 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5137 ;
   5138 ; HASWELL-LABEL: test_pshufb:
   5139 ; HASWELL:       # %bb.0:
   5140 ; HASWELL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5141 ; HASWELL-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5142 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5143 ;
   5144 ; BROADWELL-LABEL: test_pshufb:
   5145 ; BROADWELL:       # %bb.0:
   5146 ; BROADWELL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5147 ; BROADWELL-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5148 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5149 ;
   5150 ; SKYLAKE-LABEL: test_pshufb:
   5151 ; SKYLAKE:       # %bb.0:
   5152 ; SKYLAKE-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5153 ; SKYLAKE-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5154 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5155 ;
   5156 ; SKX-LABEL: test_pshufb:
   5157 ; SKX:       # %bb.0:
   5158 ; SKX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5159 ; SKX-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5160 ; SKX-NEXT:    retq # sched: [7:1.00]
   5161 ;
   5162 ; ZNVER1-LABEL: test_pshufb:
   5163 ; ZNVER1:       # %bb.0:
   5164 ; ZNVER1-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5165 ; ZNVER1-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5166 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5167   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
          ; Memory form: the second operand comes from a 32-byte-aligned load,
          ; which the checks above expect to appear as the (%rdi) operand.
   5168   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   5169   %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> %2)
   5170   ret <32 x i8> %3
   5171 }
        ; Intrinsic used by test_pshufb above.
   5172 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
   5173 
   5174 define <8 x i32> @test_pshufd(<8 x i32> %a0, <8 x i32> *%a1) {
        ; Scheduling test for the 256-bit vpshufd instruction.
        ; Two shufflevector operations with different masks are applied, one to
        ; the register argument %a0 and one to a vector loaded from %a1; the
        ; results are added. The expected assembly is a register-source vpshufd,
        ; a memory-source vpshufd and a vpaddd.
        ; The ZNVER1 group lists the memory-source shuffle first, unlike the
        ; other groups.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5175 ; GENERIC-LABEL: test_pshufd:
   5176 ; GENERIC:       # %bb.0:
   5177 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
   5178 ; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
   5179 ; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5180 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5181 ;
   5182 ; HASWELL-LABEL: test_pshufd:
   5183 ; HASWELL:       # %bb.0:
   5184 ; HASWELL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
   5185 ; HASWELL-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
   5186 ; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5187 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5188 ;
   5189 ; BROADWELL-LABEL: test_pshufd:
   5190 ; BROADWELL:       # %bb.0:
   5191 ; BROADWELL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
   5192 ; BROADWELL-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [7:1.00]
   5193 ; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5194 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5195 ;
   5196 ; SKYLAKE-LABEL: test_pshufd:
   5197 ; SKYLAKE:       # %bb.0:
   5198 ; SKYLAKE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
   5199 ; SKYLAKE-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
   5200 ; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5201 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5202 ;
   5203 ; SKX-LABEL: test_pshufd:
   5204 ; SKX:       # %bb.0:
   5205 ; SKX-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
   5206 ; SKX-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
   5207 ; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5208 ; SKX-NEXT:    retq # sched: [7:1.00]
   5209 ;
   5210 ; ZNVER1-LABEL: test_pshufd:
   5211 ; ZNVER1:       # %bb.0:
   5212 ; ZNVER1-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:0.50]
   5213 ; ZNVER1-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.25]
   5214 ; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5215 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register source: reverse the four elements within each half of %a0.
   5216   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
          ; Memory source: swap adjacent element pairs of the loaded vector.
   5217   %2 = load <8 x i32>, <8 x i32> *%a1, align 32
   5218   %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
          ; Combine both shuffles so neither result is unused.
   5219   %4 = add <8 x i32> %1, %3
   5220   ret <8 x i32> %4
   5221 }
   5222 
   5223 define <16 x i16> @test_pshufhw(<16 x i16> %a0, <16 x i16> *%a1) {
        ; Scheduling test for the 256-bit vpshufhw instruction.
        ; Two shufflevector operations whose masks only permute elements 4-7 and
        ; 12-15 are applied, one to the register argument %a0 and one to a vector
        ; loaded from %a1; the results are OR-ed. The expected assembly is a
        ; register-source vpshufhw, a memory-source vpshufhw and a vpor.
        ; The ZNVER1 group lists the memory-source shuffle first, unlike the
        ; other groups.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5224 ; GENERIC-LABEL: test_pshufhw:
   5225 ; GENERIC:       # %bb.0:
   5226 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
   5227 ; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
   5228 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5229 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5230 ;
   5231 ; HASWELL-LABEL: test_pshufhw:
   5232 ; HASWELL:       # %bb.0:
   5233 ; HASWELL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
   5234 ; HASWELL-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
   5235 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5236 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5237 ;
   5238 ; BROADWELL-LABEL: test_pshufhw:
   5239 ; BROADWELL:       # %bb.0:
   5240 ; BROADWELL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
   5241 ; BROADWELL-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [7:1.00]
   5242 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5243 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5244 ;
   5245 ; SKYLAKE-LABEL: test_pshufhw:
   5246 ; SKYLAKE:       # %bb.0:
   5247 ; SKYLAKE-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
   5248 ; SKYLAKE-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
   5249 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5250 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5251 ;
   5252 ; SKX-LABEL: test_pshufhw:
   5253 ; SKX:       # %bb.0:
   5254 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
   5255 ; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
   5256 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5257 ; SKX-NEXT:    retq # sched: [7:1.00]
   5258 ;
   5259 ; ZNVER1-LABEL: test_pshufhw:
   5260 ; ZNVER1:       # %bb.0:
   5261 ; ZNVER1-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:0.50]
   5262 ; ZNVER1-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:0.25]
   5263 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5264 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register source: reverse elements 4-7 and 12-15 of %a0, keep the rest.
   5265   %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
          ; Memory source: swap adjacent pairs within elements 4-7 and 12-15 of
          ; the loaded vector, keep the rest.
   5266   %2 = load <16 x i16>, <16 x i16> *%a1, align 32
   5267   %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
          ; Combine both shuffles so neither result is unused.
   5268   %4 = or <16 x i16> %1, %3
   5269   ret <16 x i16> %4
   5270 }
   5271 
   5272 define <16 x i16> @test_pshuflw(<16 x i16> %a0, <16 x i16> *%a1) {
        ; Scheduling test for the 256-bit vpshuflw instruction.
        ; Two shufflevector operations whose masks only permute elements 0-3 and
        ; 8-11 are applied, one to the register argument %a0 and one to a vector
        ; loaded from %a1; the results are OR-ed. The expected assembly is a
        ; register-source vpshuflw, a memory-source vpshuflw and a vpor.
        ; The ZNVER1 group lists the memory-source shuffle first, unlike the
        ; other groups.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5273 ; GENERIC-LABEL: test_pshuflw:
   5274 ; GENERIC:       # %bb.0:
   5275 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
   5276 ; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
   5277 ; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5278 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5279 ;
   5280 ; HASWELL-LABEL: test_pshuflw:
   5281 ; HASWELL:       # %bb.0:
   5282 ; HASWELL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
   5283 ; HASWELL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
   5284 ; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5285 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5286 ;
   5287 ; BROADWELL-LABEL: test_pshuflw:
   5288 ; BROADWELL:       # %bb.0:
   5289 ; BROADWELL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
   5290 ; BROADWELL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [7:1.00]
   5291 ; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5292 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5293 ;
   5294 ; SKYLAKE-LABEL: test_pshuflw:
   5295 ; SKYLAKE:       # %bb.0:
   5296 ; SKYLAKE-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
   5297 ; SKYLAKE-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
   5298 ; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5299 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5300 ;
   5301 ; SKX-LABEL: test_pshuflw:
   5302 ; SKX:       # %bb.0:
   5303 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
   5304 ; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
   5305 ; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   5306 ; SKX-NEXT:    retq # sched: [7:1.00]
   5307 ;
   5308 ; ZNVER1-LABEL: test_pshuflw:
   5309 ; ZNVER1:       # %bb.0:
   5310 ; ZNVER1-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:0.50]
   5311 ; ZNVER1-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:0.25]
   5312 ; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5313 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register source: reverse elements 0-3 and 8-11 of %a0, keep the rest.
   5314   %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
          ; Memory source: swap adjacent pairs within elements 0-3 and 8-11 of
          ; the loaded vector, keep the rest.
   5315   %2 = load <16 x i16>, <16 x i16> *%a1, align 32
   5316   %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
          ; Combine both shuffles so neither result is unused.
   5317   %4 = or <16 x i16> %1, %3
   5318   ret <16 x i16> %4
   5319 }
   5320 
   5321 define <32 x i8> @test_psignb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Scheduling test for the 256-bit vpsignb instruction.
        ; The body calls llvm.x86.avx2.psign.b twice: once on the two vector
        ; arguments and once on that result and a vector loaded from %a2, so the
        ; expected assembly holds a register-register and a memory-operand vpsignb.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5322 ; GENERIC-LABEL: test_psignb:
   5323 ; GENERIC:       # %bb.0:
   5324 ; GENERIC-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5325 ; GENERIC-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5326 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5327 ;
   5328 ; HASWELL-LABEL: test_psignb:
   5329 ; HASWELL:       # %bb.0:
   5330 ; HASWELL-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5331 ; HASWELL-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5332 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5333 ;
   5334 ; BROADWELL-LABEL: test_psignb:
   5335 ; BROADWELL:       # %bb.0:
   5336 ; BROADWELL-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5337 ; BROADWELL-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   5338 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5339 ;
   5340 ; SKYLAKE-LABEL: test_psignb:
   5341 ; SKYLAKE:       # %bb.0:
   5342 ; SKYLAKE-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5343 ; SKYLAKE-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5344 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5345 ;
   5346 ; SKX-LABEL: test_psignb:
   5347 ; SKX:       # %bb.0:
   5348 ; SKX-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5349 ; SKX-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5350 ; SKX-NEXT:    retq # sched: [7:1.00]
   5351 ;
   5352 ; ZNVER1-LABEL: test_psignb:
   5353 ; ZNVER1:       # %bb.0:
   5354 ; ZNVER1-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5355 ; ZNVER1-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5356 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5357   %1 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
          ; Memory form: the second operand comes from a 32-byte-aligned load,
          ; which the checks above expect to appear as the (%rdi) operand.
   5358   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   5359   %3 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %1, <32 x i8> %2)
   5360   ret <32 x i8> %3
   5361 }
        ; Intrinsic used by test_psignb above.
   5362 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
   5363 
   5364 define <8 x i32> @test_psignd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling test for the 256-bit vpsignd instruction.
        ; The body calls llvm.x86.avx2.psign.d twice: once on the two vector
        ; arguments and once on that result and a vector loaded from %a2, so the
        ; expected assembly holds a register-register and a memory-operand vpsignd.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5365 ; GENERIC-LABEL: test_psignd:
   5366 ; GENERIC:       # %bb.0:
   5367 ; GENERIC-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5368 ; GENERIC-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5369 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5370 ;
   5371 ; HASWELL-LABEL: test_psignd:
   5372 ; HASWELL:       # %bb.0:
   5373 ; HASWELL-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5374 ; HASWELL-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5375 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5376 ;
   5377 ; BROADWELL-LABEL: test_psignd:
   5378 ; BROADWELL:       # %bb.0:
   5379 ; BROADWELL-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5380 ; BROADWELL-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   5381 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5382 ;
   5383 ; SKYLAKE-LABEL: test_psignd:
   5384 ; SKYLAKE:       # %bb.0:
   5385 ; SKYLAKE-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5386 ; SKYLAKE-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5387 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5388 ;
   5389 ; SKX-LABEL: test_psignd:
   5390 ; SKX:       # %bb.0:
   5391 ; SKX-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5392 ; SKX-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5393 ; SKX-NEXT:    retq # sched: [7:1.00]
   5394 ;
   5395 ; ZNVER1-LABEL: test_psignd:
   5396 ; ZNVER1:       # %bb.0:
   5397 ; ZNVER1-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5398 ; ZNVER1-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5399 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5400   %1 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
          ; Memory form: the second operand comes from a 32-byte-aligned load,
          ; which the checks above expect to appear as the (%rdi) operand.
   5401   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   5402   %3 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %1, <8 x i32> %2)
   5403   ret <8 x i32> %3
   5404 }
        ; Intrinsic used by test_psignd above.
   5405 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
   5406 
   5407 define <16 x i16> @test_psignw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling test for the 256-bit vpsignw instruction.
        ; The body calls llvm.x86.avx2.psign.w twice: once on the two vector
        ; arguments and once on that result and a vector loaded from %a2, so the
        ; expected assembly holds a register-register and a memory-operand vpsignw.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5408 ; GENERIC-LABEL: test_psignw:
   5409 ; GENERIC:       # %bb.0:
   5410 ; GENERIC-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5411 ; GENERIC-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5412 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5413 ;
   5414 ; HASWELL-LABEL: test_psignw:
   5415 ; HASWELL:       # %bb.0:
   5416 ; HASWELL-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5417 ; HASWELL-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5418 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5419 ;
   5420 ; BROADWELL-LABEL: test_psignw:
   5421 ; BROADWELL:       # %bb.0:
   5422 ; BROADWELL-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5423 ; BROADWELL-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   5424 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5425 ;
   5426 ; SKYLAKE-LABEL: test_psignw:
   5427 ; SKYLAKE:       # %bb.0:
   5428 ; SKYLAKE-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5429 ; SKYLAKE-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5430 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5431 ;
   5432 ; SKX-LABEL: test_psignw:
   5433 ; SKX:       # %bb.0:
   5434 ; SKX-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5435 ; SKX-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5436 ; SKX-NEXT:    retq # sched: [7:1.00]
   5437 ;
   5438 ; ZNVER1-LABEL: test_psignw:
   5439 ; ZNVER1:       # %bb.0:
   5440 ; ZNVER1-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   5441 ; ZNVER1-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5442 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register form: both operands are function arguments.
   5443   %1 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
          ; Memory form: the second operand comes from a 32-byte-aligned load,
          ; which the checks above expect to appear as the (%rdi) operand.
   5444   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   5445   %3 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %1, <16 x i16> %2)
   5446   ret <16 x i16> %3
   5447 }
        ; Intrinsic used by test_psignw above.
   5448 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
   5449 
   5450 define <8 x i32> @test_pslld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling test for the 256-bit vpslld instruction in three forms.
        ; llvm.x86.avx2.psll.d is called with the shift count in the <4 x i32>
        ; argument %a1 and then with a count loaded from %a2; a plain IR shl by a
        ; constant splat of 2 follows. The expected assembly is vpslld with an
        ; xmm count, vpslld with a memory count, and vpslld with immediate $2.
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "# sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5451 ; GENERIC-LABEL: test_pslld:
   5452 ; GENERIC:       # %bb.0:
   5453 ; GENERIC-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5454 ; GENERIC-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5455 ; GENERIC-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
   5456 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5457 ;
   5458 ; HASWELL-LABEL: test_pslld:
   5459 ; HASWELL:       # %bb.0:
   5460 ; HASWELL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5461 ; HASWELL-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5462 ; HASWELL-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
   5463 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5464 ;
   5465 ; BROADWELL-LABEL: test_pslld:
   5466 ; BROADWELL:       # %bb.0:
   5467 ; BROADWELL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5468 ; BROADWELL-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5469 ; BROADWELL-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
   5470 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5471 ;
   5472 ; SKYLAKE-LABEL: test_pslld:
   5473 ; SKYLAKE:       # %bb.0:
   5474 ; SKYLAKE-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5475 ; SKYLAKE-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5476 ; SKYLAKE-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
   5477 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5478 ;
   5479 ; SKX-LABEL: test_pslld:
   5480 ; SKX:       # %bb.0:
   5481 ; SKX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5482 ; SKX-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5483 ; SKX-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
   5484 ; SKX-NEXT:    retq # sched: [7:1.00]
   5485 ;
   5486 ; ZNVER1-LABEL: test_pslld:
   5487 ; ZNVER1:       # %bb.0:
   5488 ; ZNVER1-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   5489 ; ZNVER1-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   5490 ; ZNVER1-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.25]
   5491 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-count form: the count vector is the argument %a1.
   5492   %1 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
          ; Memory-count form: the count is a 128-bit vector loaded with 16-byte
          ; alignment.
   5493   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   5494   %3 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %1, <4 x i32> %2)
          ; Immediate form: shift every element left by the constant 2.
   5495   %4 = shl <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   5496   ret <8 x i32> %4
   5497 }
        ; Intrinsic used by test_pslld above.
   5498 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
   5499 
   5500 define <32 x i8> @test_pslldq(<32 x i8> %a0) {
        ; Scheduling test for the 256-bit vpslldq instruction.
        ; A single shufflevector mixes a zero vector with %a0 so that, in each
        ; 16-byte half, three zero bytes are followed by the first thirteen bytes
        ; of that half of %a0; the expected assembly is one vpslldq. Only a
        ; register operand is used here (there is no pointer argument).
        ; There is one group of check lines per CPU model exercised by the llc
        ; invocations at the top of the file; each line pins the instruction text
        ; and the "sched" annotation printed by -print-schedule.
        ; The check lines are autogenerated (utils/update_llc_test_checks.py);
        ; regenerate them with that script instead of editing them by hand.
   5501 ; GENERIC-LABEL: test_pslldq:
   5502 ; GENERIC:       # %bb.0:
   5503 ; GENERIC-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
   5504 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5505 ;
   5506 ; HASWELL-LABEL: test_pslldq:
   5507 ; HASWELL:       # %bb.0:
   5508 ; HASWELL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
   5509 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5510 ;
   5511 ; BROADWELL-LABEL: test_pslldq:
   5512 ; BROADWELL:       # %bb.0:
   5513 ; BROADWELL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
   5514 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5515 ;
   5516 ; SKYLAKE-LABEL: test_pslldq:
   5517 ; SKYLAKE:       # %bb.0:
   5518 ; SKYLAKE-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
   5519 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5520 ;
   5521 ; SKX-LABEL: test_pslldq:
   5522 ; SKX:       # %bb.0:
   5523 ; SKX-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
   5524 ; SKX-NEXT:    retq # sched: [7:1.00]
   5525 ;
   5526 ; ZNVER1-LABEL: test_pslldq:
   5527 ; ZNVER1:       # %bb.0:
   5528 ; ZNVER1-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [2:1.00]
   5529 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-31 select from the zero vector (first operand) and
          ; 32-63 from %a0: three zero bytes, then thirteen bytes of %a0, per half.
   5530   %1 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
   5531   ret <32 x i8> %1
   5532 }
   5533 
   5534 define <4 x i64> @test_psllq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
        ; Scheduling-annotation test covering three vpsllq forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the shl by a splat of 2).
   5535 ; GENERIC-LABEL: test_psllq:
   5536 ; GENERIC:       # %bb.0:
   5537 ; GENERIC-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5538 ; GENERIC-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5539 ; GENERIC-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
   5540 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5541 ;
   5542 ; HASWELL-LABEL: test_psllq:
   5543 ; HASWELL:       # %bb.0:
   5544 ; HASWELL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5545 ; HASWELL-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5546 ; HASWELL-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
   5547 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5548 ;
   5549 ; BROADWELL-LABEL: test_psllq:
   5550 ; BROADWELL:       # %bb.0:
   5551 ; BROADWELL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5552 ; BROADWELL-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5553 ; BROADWELL-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
   5554 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5555 ;
   5556 ; SKYLAKE-LABEL: test_psllq:
   5557 ; SKYLAKE:       # %bb.0:
   5558 ; SKYLAKE-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5559 ; SKYLAKE-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5560 ; SKYLAKE-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
   5561 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5562 ;
   5563 ; SKX-LABEL: test_psllq:
   5564 ; SKX:       # %bb.0:
   5565 ; SKX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5566 ; SKX-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5567 ; SKX-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
   5568 ; SKX-NEXT:    retq # sched: [7:1.00]
   5569 ;
   5570 ; ZNVER1-LABEL: test_psllq:
   5571 ; ZNVER1:       # %bb.0:
   5572 ; ZNVER1-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   5573 ; ZNVER1-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   5574 ; ZNVER1-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.25]
   5575 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5576   %1 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
   5577   %2 = load <2 x i64>, <2 x i64> *%a2, align 16
   5578   %3 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %1, <2 x i64> %2)
   5579   %4 = shl <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
   5580   ret <4 x i64> %4
   5581 }
   5582 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
   5583 
   5584 define <4 x i32> @test_psllvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling-annotation test covering two vpsllvd forms on xmm: count
        ; vector in %xmm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the load of %a2).
   5585 ; GENERIC-LABEL: test_psllvd:
   5586 ; GENERIC:       # %bb.0:
   5587 ; GENERIC-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   5588 ; GENERIC-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   5589 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5590 ;
   5591 ; HASWELL-LABEL: test_psllvd:
   5592 ; HASWELL:       # %bb.0:
   5593 ; HASWELL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   5594 ; HASWELL-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
   5595 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5596 ;
   5597 ; BROADWELL-LABEL: test_psllvd:
   5598 ; BROADWELL:       # %bb.0:
   5599 ; BROADWELL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   5600 ; BROADWELL-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
   5601 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5602 ;
   5603 ; SKYLAKE-LABEL: test_psllvd:
   5604 ; SKYLAKE:       # %bb.0:
   5605 ; SKYLAKE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5606 ; SKYLAKE-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5607 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5608 ;
   5609 ; SKX-LABEL: test_psllvd:
   5610 ; SKX:       # %bb.0:
   5611 ; SKX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5612 ; SKX-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5613 ; SKX-NEXT:    retq # sched: [7:1.00]
   5614 ;
   5615 ; ZNVER1-LABEL: test_psllvd:
   5616 ; ZNVER1:       # %bb.0:
   5617 ; ZNVER1-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5618 ; ZNVER1-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
   5619 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5620   %1 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
   5621   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   5622   %3 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %1, <4 x i32> %2)
   5623   ret <4 x i32> %3
   5624 }
   5625 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
   5626 
   5627 define <8 x i32> @test_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling-annotation test covering two vpsllvd forms on ymm: count
        ; vector in %ymm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the 32-byte-aligned load of %a2).
   5628 ; GENERIC-LABEL: test_psllvd_ymm:
   5629 ; GENERIC:       # %bb.0:
   5630 ; GENERIC-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5631 ; GENERIC-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5632 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5633 ;
   5634 ; HASWELL-LABEL: test_psllvd_ymm:
   5635 ; HASWELL:       # %bb.0:
   5636 ; HASWELL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   5637 ; HASWELL-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   5638 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5639 ;
   5640 ; BROADWELL-LABEL: test_psllvd_ymm:
   5641 ; BROADWELL:       # %bb.0:
   5642 ; BROADWELL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   5643 ; BROADWELL-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   5644 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5645 ;
   5646 ; SKYLAKE-LABEL: test_psllvd_ymm:
   5647 ; SKYLAKE:       # %bb.0:
   5648 ; SKYLAKE-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5649 ; SKYLAKE-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5650 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5651 ;
   5652 ; SKX-LABEL: test_psllvd_ymm:
   5653 ; SKX:       # %bb.0:
   5654 ; SKX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5655 ; SKX-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5656 ; SKX-NEXT:    retq # sched: [7:1.00]
   5657 ;
   5658 ; ZNVER1-LABEL: test_psllvd_ymm:
   5659 ; ZNVER1:       # %bb.0:
   5660 ; ZNVER1-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5661 ; ZNVER1-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5662 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5663   %1 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
   5664   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   5665   %3 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %1, <8 x i32> %2)
   5666   ret <8 x i32> %3
   5667 }
   5668 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   5669 
   5670 define <2 x i64> @test_psllvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
        ; Scheduling-annotation test covering two vpsllvq forms on xmm: count
        ; vector in %xmm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the load of %a2).
   5671 ; GENERIC-LABEL: test_psllvq:
   5672 ; GENERIC:       # %bb.0:
   5673 ; GENERIC-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   5674 ; GENERIC-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   5675 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5676 ;
   5677 ; HASWELL-LABEL: test_psllvq:
   5678 ; HASWELL:       # %bb.0:
   5679 ; HASWELL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   5680 ; HASWELL-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   5681 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5682 ;
   5683 ; BROADWELL-LABEL: test_psllvq:
   5684 ; BROADWELL:       # %bb.0:
   5685 ; BROADWELL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   5686 ; BROADWELL-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
   5687 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5688 ;
   5689 ; SKYLAKE-LABEL: test_psllvq:
   5690 ; SKYLAKE:       # %bb.0:
   5691 ; SKYLAKE-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5692 ; SKYLAKE-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5693 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5694 ;
   5695 ; SKX-LABEL: test_psllvq:
   5696 ; SKX:       # %bb.0:
   5697 ; SKX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5698 ; SKX-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5699 ; SKX-NEXT:    retq # sched: [7:1.00]
   5700 ;
   5701 ; ZNVER1-LABEL: test_psllvq:
   5702 ; ZNVER1:       # %bb.0:
   5703 ; ZNVER1-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5704 ; ZNVER1-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
   5705 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5706   %1 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
   5707   %2 = load <2 x i64>, <2 x i64> *%a2, align 16
   5708   %3 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %1, <2 x i64> %2)
   5709   ret <2 x i64> %3
   5710 }
   5711 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
   5712 
   5713 define <4 x i64> @test_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Scheduling-annotation test covering two vpsllvq forms on ymm: count
        ; vector in %ymm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the 32-byte-aligned load of %a2).
   5714 ; GENERIC-LABEL: test_psllvq_ymm:
   5715 ; GENERIC:       # %bb.0:
   5716 ; GENERIC-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5717 ; GENERIC-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5718 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5719 ;
   5720 ; HASWELL-LABEL: test_psllvq_ymm:
   5721 ; HASWELL:       # %bb.0:
   5722 ; HASWELL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5723 ; HASWELL-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5724 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5725 ;
   5726 ; BROADWELL-LABEL: test_psllvq_ymm:
   5727 ; BROADWELL:       # %bb.0:
   5728 ; BROADWELL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5729 ; BROADWELL-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5730 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5731 ;
   5732 ; SKYLAKE-LABEL: test_psllvq_ymm:
   5733 ; SKYLAKE:       # %bb.0:
   5734 ; SKYLAKE-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5735 ; SKYLAKE-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5736 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5737 ;
   5738 ; SKX-LABEL: test_psllvq_ymm:
   5739 ; SKX:       # %bb.0:
   5740 ; SKX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5741 ; SKX-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5742 ; SKX-NEXT:    retq # sched: [7:1.00]
   5743 ;
   5744 ; ZNVER1-LABEL: test_psllvq_ymm:
   5745 ; ZNVER1:       # %bb.0:
   5746 ; ZNVER1-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5747 ; ZNVER1-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5748 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5749   %1 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
   5750   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   5751   %3 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %1, <4 x i64> %2)
   5752   ret <4 x i64> %3
   5753 }
   5754 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
   5755 
   5756 define <16 x i16> @test_psllw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
        ; Scheduling-annotation test covering three vpsllw forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the shl by a splat of 2).
   5757 ; GENERIC-LABEL: test_psllw:
   5758 ; GENERIC:       # %bb.0:
   5759 ; GENERIC-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5760 ; GENERIC-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5761 ; GENERIC-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5762 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5763 ;
   5764 ; HASWELL-LABEL: test_psllw:
   5765 ; HASWELL:       # %bb.0:
   5766 ; HASWELL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5767 ; HASWELL-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5768 ; HASWELL-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5769 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5770 ;
   5771 ; BROADWELL-LABEL: test_psllw:
   5772 ; BROADWELL:       # %bb.0:
   5773 ; BROADWELL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5774 ; BROADWELL-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5775 ; BROADWELL-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5776 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5777 ;
   5778 ; SKYLAKE-LABEL: test_psllw:
   5779 ; SKYLAKE:       # %bb.0:
   5780 ; SKYLAKE-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5781 ; SKYLAKE-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5782 ; SKYLAKE-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
   5783 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5784 ;
   5785 ; SKX-LABEL: test_psllw:
   5786 ; SKX:       # %bb.0:
   5787 ; SKX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5788 ; SKX-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5789 ; SKX-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
   5790 ; SKX-NEXT:    retq # sched: [7:1.00]
   5791 ;
   5792 ; ZNVER1-LABEL: test_psllw:
   5793 ; ZNVER1:       # %bb.0:
   5794 ; ZNVER1-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   5795 ; ZNVER1-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   5796 ; ZNVER1-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.25]
   5797 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5798   %1 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
   5799   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
   5800   %3 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %1, <8 x i16> %2)
   5801   %4 = shl <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   5802   ret <16 x i16> %4
   5803 }
   5804 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
   5805 
   5806 define <8 x i32> @test_psrad(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling-annotation test covering three vpsrad forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the ashr by a splat of 2).
   5807 ; GENERIC-LABEL: test_psrad:
   5808 ; GENERIC:       # %bb.0:
   5809 ; GENERIC-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5810 ; GENERIC-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5811 ; GENERIC-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
   5812 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5813 ;
   5814 ; HASWELL-LABEL: test_psrad:
   5815 ; HASWELL:       # %bb.0:
   5816 ; HASWELL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5817 ; HASWELL-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5818 ; HASWELL-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
   5819 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5820 ;
   5821 ; BROADWELL-LABEL: test_psrad:
   5822 ; BROADWELL:       # %bb.0:
   5823 ; BROADWELL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5824 ; BROADWELL-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5825 ; BROADWELL-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
   5826 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5827 ;
   5828 ; SKYLAKE-LABEL: test_psrad:
   5829 ; SKYLAKE:       # %bb.0:
   5830 ; SKYLAKE-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5831 ; SKYLAKE-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5832 ; SKYLAKE-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
   5833 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5834 ;
   5835 ; SKX-LABEL: test_psrad:
   5836 ; SKX:       # %bb.0:
   5837 ; SKX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5838 ; SKX-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5839 ; SKX-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
   5840 ; SKX-NEXT:    retq # sched: [7:1.00]
   5841 ;
   5842 ; ZNVER1-LABEL: test_psrad:
   5843 ; ZNVER1:       # %bb.0:
   5844 ; ZNVER1-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   5845 ; ZNVER1-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   5846 ; ZNVER1-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.25]
   5847 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5848   %1 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
   5849   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   5850   %3 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> %2)
   5851   %4 = ashr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   5852   ret <8 x i32> %4
   5853 }
   5854 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
   5855 
   5856 define <4 x i32> @test_psravd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling-annotation test covering two vpsravd forms on xmm: count
        ; vector in %xmm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the load of %a2).
   5857 ; GENERIC-LABEL: test_psravd:
   5858 ; GENERIC:       # %bb.0:
   5859 ; GENERIC-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   5860 ; GENERIC-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   5861 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5862 ;
   5863 ; HASWELL-LABEL: test_psravd:
   5864 ; HASWELL:       # %bb.0:
   5865 ; HASWELL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   5866 ; HASWELL-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
   5867 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5868 ;
   5869 ; BROADWELL-LABEL: test_psravd:
   5870 ; BROADWELL:       # %bb.0:
   5871 ; BROADWELL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   5872 ; BROADWELL-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
   5873 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5874 ;
   5875 ; SKYLAKE-LABEL: test_psravd:
   5876 ; SKYLAKE:       # %bb.0:
   5877 ; SKYLAKE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5878 ; SKYLAKE-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5879 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5880 ;
   5881 ; SKX-LABEL: test_psravd:
   5882 ; SKX:       # %bb.0:
   5883 ; SKX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5884 ; SKX-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   5885 ; SKX-NEXT:    retq # sched: [7:1.00]
   5886 ;
   5887 ; ZNVER1-LABEL: test_psravd:
   5888 ; ZNVER1:       # %bb.0:
   5889 ; ZNVER1-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   5890 ; ZNVER1-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
   5891 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5892   %1 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
   5893   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   5894   %3 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %1, <4 x i32> %2)
   5895   ret <4 x i32> %3
   5896 }
   5897 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
   5898 
   5899 define <8 x i32> @test_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling-annotation test covering two vpsravd forms on ymm: count
        ; vector in %ymm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the 32-byte-aligned load of %a2).
   5900 ; GENERIC-LABEL: test_psravd_ymm:
   5901 ; GENERIC:       # %bb.0:
   5902 ; GENERIC-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   5903 ; GENERIC-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5904 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5905 ;
   5906 ; HASWELL-LABEL: test_psravd_ymm:
   5907 ; HASWELL:       # %bb.0:
   5908 ; HASWELL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   5909 ; HASWELL-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   5910 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5911 ;
   5912 ; BROADWELL-LABEL: test_psravd_ymm:
   5913 ; BROADWELL:       # %bb.0:
   5914 ; BROADWELL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   5915 ; BROADWELL-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   5916 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5917 ;
   5918 ; SKYLAKE-LABEL: test_psravd_ymm:
   5919 ; SKYLAKE:       # %bb.0:
   5920 ; SKYLAKE-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5921 ; SKYLAKE-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5922 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5923 ;
   5924 ; SKX-LABEL: test_psravd_ymm:
   5925 ; SKX:       # %bb.0:
   5926 ; SKX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5927 ; SKX-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5928 ; SKX-NEXT:    retq # sched: [7:1.00]
   5929 ;
   5930 ; ZNVER1-LABEL: test_psravd_ymm:
   5931 ; ZNVER1:       # %bb.0:
   5932 ; ZNVER1-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   5933 ; ZNVER1-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5934 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5935   %1 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
   5936   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   5937   %3 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %1, <8 x i32> %2)
   5938   ret <8 x i32> %3
   5939 }
   5940 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   5941 
   5942 define <16 x i16> @test_psraw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
        ; Scheduling-annotation test covering three vpsraw forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the ashr by a splat of 2).
   5943 ; GENERIC-LABEL: test_psraw:
   5944 ; GENERIC:       # %bb.0:
   5945 ; GENERIC-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5946 ; GENERIC-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5947 ; GENERIC-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5948 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5949 ;
   5950 ; HASWELL-LABEL: test_psraw:
   5951 ; HASWELL:       # %bb.0:
   5952 ; HASWELL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5953 ; HASWELL-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   5954 ; HASWELL-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5955 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   5956 ;
   5957 ; BROADWELL-LABEL: test_psraw:
   5958 ; BROADWELL:       # %bb.0:
   5959 ; BROADWELL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5960 ; BROADWELL-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   5961 ; BROADWELL-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
   5962 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   5963 ;
   5964 ; SKYLAKE-LABEL: test_psraw:
   5965 ; SKYLAKE:       # %bb.0:
   5966 ; SKYLAKE-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5967 ; SKYLAKE-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5968 ; SKYLAKE-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
   5969 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   5970 ;
   5971 ; SKX-LABEL: test_psraw:
   5972 ; SKX:       # %bb.0:
   5973 ; SKX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5974 ; SKX-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   5975 ; SKX-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
   5976 ; SKX-NEXT:    retq # sched: [7:1.00]
   5977 ;
   5978 ; ZNVER1-LABEL: test_psraw:
   5979 ; ZNVER1:       # %bb.0:
   5980 ; ZNVER1-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   5981 ; ZNVER1-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   5982 ; ZNVER1-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.25]
   5983 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   5984   %1 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
   5985   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
   5986   %3 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> %2)
   5987   %4 = ashr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   5988   ret <16 x i16> %4
   5989 }
   5990 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
   5991 
   5992 define <8 x i32> @test_psrld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling-annotation test covering three vpsrld forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the lshr by a splat of 2).
   5993 ; GENERIC-LABEL: test_psrld:
   5994 ; GENERIC:       # %bb.0:
   5995 ; GENERIC-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   5996 ; GENERIC-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   5997 ; GENERIC-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
   5998 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   5999 ;
   6000 ; HASWELL-LABEL: test_psrld:
   6001 ; HASWELL:       # %bb.0:
   6002 ; HASWELL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6003 ; HASWELL-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6004 ; HASWELL-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
   6005 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6006 ;
   6007 ; BROADWELL-LABEL: test_psrld:
   6008 ; BROADWELL:       # %bb.0:
   6009 ; BROADWELL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6010 ; BROADWELL-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   6011 ; BROADWELL-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
   6012 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6013 ;
   6014 ; SKYLAKE-LABEL: test_psrld:
   6015 ; SKYLAKE:       # %bb.0:
   6016 ; SKYLAKE-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6017 ; SKYLAKE-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6018 ; SKYLAKE-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
   6019 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6020 ;
   6021 ; SKX-LABEL: test_psrld:
   6022 ; SKX:       # %bb.0:
   6023 ; SKX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6024 ; SKX-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6025 ; SKX-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
   6026 ; SKX-NEXT:    retq # sched: [7:1.00]
   6027 ;
   6028 ; ZNVER1-LABEL: test_psrld:
   6029 ; ZNVER1:       # %bb.0:
   6030 ; ZNVER1-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   6031 ; ZNVER1-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   6032 ; ZNVER1-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.25]
   6033 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   6034   %1 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
   6035   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   6036   %3 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %1, <4 x i32> %2)
   6037   %4 = lshr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
   6038   ret <8 x i32> %4
   6039 }
   6040 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
   6041 
   6042 define <32 x i8> @test_psrldq(<32 x i8> %a0) {
        ; Scheduling-annotation test for the 256-bit vpsrldq byte shift. The
        ; assertions below pin the "sched" annotation printed for each -mcpu run.
   6043 ; GENERIC-LABEL: test_psrldq:
   6044 ; GENERIC:       # %bb.0:
   6045 ; GENERIC-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
   6046 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6047 ;
   6048 ; HASWELL-LABEL: test_psrldq:
   6049 ; HASWELL:       # %bb.0:
   6050 ; HASWELL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
   6051 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6052 ;
   6053 ; BROADWELL-LABEL: test_psrldq:
   6054 ; BROADWELL:       # %bb.0:
   6055 ; BROADWELL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
   6056 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6057 ;
   6058 ; SKYLAKE-LABEL: test_psrldq:
   6059 ; SKYLAKE:       # %bb.0:
   6060 ; SKYLAKE-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
   6061 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6062 ;
   6063 ; SKX-LABEL: test_psrldq:
   6064 ; SKX:       # %bb.0:
   6065 ; SKX-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
   6066 ; SKX-NEXT:    retq # sched: [7:1.00]
   6067 ;
   6068 ; ZNVER1-LABEL: test_psrldq:
   6069 ; ZNVER1:       # %bb.0:
   6070 ; ZNVER1-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [2:1.00]
   6071 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Mask indices 0-31 address %a0 and 32-63 address the zero vector. Each
        ; 16-byte half takes 13 bytes of %a0 (elements 3-15, then 19-31) followed
        ; by 3 zero bytes, which the assertions expect to match vpsrldq.
   6072   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
   6073   ret <32 x i8> %1
   6074 }
   6075 
   6076 define <4 x i64> @test_psrlq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
        ; Scheduling-annotation test covering three vpsrlq forms on ymm: count in
        ; %xmm1 (first intrinsic call), count read from (%rdi) (second call, fed by
        ; the load of %a2), and the immediate $2 form (the lshr by a splat of 2).
   6077 ; GENERIC-LABEL: test_psrlq:
   6078 ; GENERIC:       # %bb.0:
   6079 ; GENERIC-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6080 ; GENERIC-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   6081 ; GENERIC-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
   6082 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6083 ;
   6084 ; HASWELL-LABEL: test_psrlq:
   6085 ; HASWELL:       # %bb.0:
   6086 ; HASWELL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6087 ; HASWELL-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6088 ; HASWELL-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
   6089 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6090 ;
   6091 ; BROADWELL-LABEL: test_psrlq:
   6092 ; BROADWELL:       # %bb.0:
   6093 ; BROADWELL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6094 ; BROADWELL-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   6095 ; BROADWELL-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
   6096 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6097 ;
   6098 ; SKYLAKE-LABEL: test_psrlq:
   6099 ; SKYLAKE:       # %bb.0:
   6100 ; SKYLAKE-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6101 ; SKYLAKE-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6102 ; SKYLAKE-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
   6103 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6104 ;
   6105 ; SKX-LABEL: test_psrlq:
   6106 ; SKX:       # %bb.0:
   6107 ; SKX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6108 ; SKX-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6109 ; SKX-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
   6110 ; SKX-NEXT:    retq # sched: [7:1.00]
   6111 ;
   6112 ; ZNVER1-LABEL: test_psrlq:
   6113 ; ZNVER1:       # %bb.0:
   6114 ; ZNVER1-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   6115 ; ZNVER1-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   6116 ; ZNVER1-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.25]
   6117 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   6118   %1 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
   6119   %2 = load <2 x i64>, <2 x i64> *%a2, align 16
   6120   %3 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %1, <2 x i64> %2)
   6121   %4 = lshr <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
   6122   ret <4 x i64> %4
   6123 }
   6124 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
   6125 
   6126 define <4 x i32> @test_psrlvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
        ; Scheduling-annotation test covering two vpsrlvd forms on xmm: count
        ; vector in %xmm1 (first intrinsic call) and count vector read from (%rdi)
        ; (second call, fed by the load of %a2).
   6127 ; GENERIC-LABEL: test_psrlvd:
   6128 ; GENERIC:       # %bb.0:
   6129 ; GENERIC-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   6130 ; GENERIC-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   6131 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6132 ;
   6133 ; HASWELL-LABEL: test_psrlvd:
   6134 ; HASWELL:       # %bb.0:
   6135 ; HASWELL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   6136 ; HASWELL-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
   6137 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6138 ;
   6139 ; BROADWELL-LABEL: test_psrlvd:
   6140 ; BROADWELL:       # %bb.0:
   6141 ; BROADWELL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
   6142 ; BROADWELL-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
   6143 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6144 ;
   6145 ; SKYLAKE-LABEL: test_psrlvd:
   6146 ; SKYLAKE:       # %bb.0:
   6147 ; SKYLAKE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6148 ; SKYLAKE-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   6149 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6150 ;
   6151 ; SKX-LABEL: test_psrlvd:
   6152 ; SKX:       # %bb.0:
   6153 ; SKX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6154 ; SKX-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   6155 ; SKX-NEXT:    retq # sched: [7:1.00]
   6156 ;
   6157 ; ZNVER1-LABEL: test_psrlvd:
   6158 ; ZNVER1:       # %bb.0:
   6159 ; ZNVER1-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6160 ; ZNVER1-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
   6161 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   6162   %1 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
   6163   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
   6164   %3 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %1, <4 x i32> %2)
   6165   ret <4 x i32> %3
   6166 }
   6167 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
   6168 
   6169 define <8 x i32> @test_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
   6170 ; GENERIC-LABEL: test_psrlvd_ymm:
   6171 ; GENERIC:       # %bb.0:
   6172 ; GENERIC-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   6173 ; GENERIC-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6174 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6175 ;
   6176 ; HASWELL-LABEL: test_psrlvd_ymm:
   6177 ; HASWELL:       # %bb.0:
   6178 ; HASWELL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   6179 ; HASWELL-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
   6180 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6181 ;
   6182 ; BROADWELL-LABEL: test_psrlvd_ymm:
   6183 ; BROADWELL:       # %bb.0:
   6184 ; BROADWELL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
   6185 ; BROADWELL-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
   6186 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6187 ;
   6188 ; SKYLAKE-LABEL: test_psrlvd_ymm:
   6189 ; SKYLAKE:       # %bb.0:
   6190 ; SKYLAKE-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6191 ; SKYLAKE-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6192 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6193 ;
   6194 ; SKX-LABEL: test_psrlvd_ymm:
   6195 ; SKX:       # %bb.0:
   6196 ; SKX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6197 ; SKX-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6198 ; SKX-NEXT:    retq # sched: [7:1.00]
   6199 ;
   6200 ; ZNVER1-LABEL: test_psrlvd_ymm:
   6201 ; ZNVER1:       # %bb.0:
   6202 ; ZNVER1-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6203 ; ZNVER1-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6204 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsrlvd. The body chains two calls to
        ; @llvm.x86.avx2.psrlv.d.256: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with an
        ; <8 x i32> loaded from %a2 (align 32). The assertions above expect a
        ; ymm register-operand vpsrlvd followed by a (%rdi) memory-operand
        ; vpsrlvd for every CPU model.
   6205   %1 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
   6206   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   6207   %3 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %1, <8 x i32> %2)
   6208   ret <8 x i32> %3
   6209 }
   6210 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   6211 
   6212 define <2 x i64> @test_psrlvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
   6213 ; GENERIC-LABEL: test_psrlvq:
   6214 ; GENERIC:       # %bb.0:
   6215 ; GENERIC-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   6216 ; GENERIC-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   6217 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6218 ;
   6219 ; HASWELL-LABEL: test_psrlvq:
   6220 ; HASWELL:       # %bb.0:
   6221 ; HASWELL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   6222 ; HASWELL-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
   6223 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6224 ;
   6225 ; BROADWELL-LABEL: test_psrlvq:
   6226 ; BROADWELL:       # %bb.0:
   6227 ; BROADWELL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
   6228 ; BROADWELL-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
   6229 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6230 ;
   6231 ; SKYLAKE-LABEL: test_psrlvq:
   6232 ; SKYLAKE:       # %bb.0:
   6233 ; SKYLAKE-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6234 ; SKYLAKE-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   6235 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6236 ;
   6237 ; SKX-LABEL: test_psrlvq:
   6238 ; SKX:       # %bb.0:
   6239 ; SKX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6240 ; SKX-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
   6241 ; SKX-NEXT:    retq # sched: [7:1.00]
   6242 ;
   6243 ; ZNVER1-LABEL: test_psrlvq:
   6244 ; ZNVER1:       # %bb.0:
   6245 ; ZNVER1-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
   6246 ; ZNVER1-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
   6247 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 128-bit vpsrlvq. The body chains two calls to
        ; @llvm.x86.avx2.psrlv.q: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <2 x i64> loaded from %a2 (align 16). The assertions above expect an
        ; xmm register-operand vpsrlvq followed by a (%rdi) memory-operand
        ; vpsrlvq for every CPU model.
   6248   %1 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
   6249   %2 = load <2 x i64>, <2 x i64> *%a2, align 16
   6250   %3 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %1, <2 x i64> %2)
   6251   ret <2 x i64> %3
   6252 }
   6253 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
   6254 
   6255 define <4 x i64> @test_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
   6256 ; GENERIC-LABEL: test_psrlvq_ymm:
   6257 ; GENERIC:       # %bb.0:
   6258 ; GENERIC-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   6259 ; GENERIC-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6260 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6261 ;
   6262 ; HASWELL-LABEL: test_psrlvq_ymm:
   6263 ; HASWELL:       # %bb.0:
   6264 ; HASWELL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   6265 ; HASWELL-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6266 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6267 ;
   6268 ; BROADWELL-LABEL: test_psrlvq_ymm:
   6269 ; BROADWELL:       # %bb.0:
   6270 ; BROADWELL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
   6271 ; BROADWELL-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   6272 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6273 ;
   6274 ; SKYLAKE-LABEL: test_psrlvq_ymm:
   6275 ; SKYLAKE:       # %bb.0:
   6276 ; SKYLAKE-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6277 ; SKYLAKE-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6278 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6279 ;
   6280 ; SKX-LABEL: test_psrlvq_ymm:
   6281 ; SKX:       # %bb.0:
   6282 ; SKX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6283 ; SKX-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6284 ; SKX-NEXT:    retq # sched: [7:1.00]
   6285 ;
   6286 ; ZNVER1-LABEL: test_psrlvq_ymm:
   6287 ; ZNVER1:       # %bb.0:
   6288 ; ZNVER1-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6289 ; ZNVER1-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6290 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsrlvq. The body chains two calls to
        ; @llvm.x86.avx2.psrlv.q.256: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <4 x i64> loaded from %a2 (align 32). The assertions above expect a
        ; ymm register-operand vpsrlvq followed by a (%rdi) memory-operand
        ; vpsrlvq for every CPU model.
   6291   %1 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
   6292   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   6293   %3 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %1, <4 x i64> %2)
   6294   ret <4 x i64> %3
   6295 }
   6296 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
   6297 
   6298 define <16 x i16> @test_psrlw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
   6299 ; GENERIC-LABEL: test_psrlw:
   6300 ; GENERIC:       # %bb.0:
   6301 ; GENERIC-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6302 ; GENERIC-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
   6303 ; GENERIC-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
   6304 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6305 ;
   6306 ; HASWELL-LABEL: test_psrlw:
   6307 ; HASWELL:       # %bb.0:
   6308 ; HASWELL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6309 ; HASWELL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
   6310 ; HASWELL-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
   6311 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6312 ;
   6313 ; BROADWELL-LABEL: test_psrlw:
   6314 ; BROADWELL:       # %bb.0:
   6315 ; BROADWELL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6316 ; BROADWELL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
   6317 ; BROADWELL-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
   6318 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6319 ;
   6320 ; SKYLAKE-LABEL: test_psrlw:
   6321 ; SKYLAKE:       # %bb.0:
   6322 ; SKYLAKE-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6323 ; SKYLAKE-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6324 ; SKYLAKE-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
   6325 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6326 ;
   6327 ; SKX-LABEL: test_psrlw:
   6328 ; SKX:       # %bb.0:
   6329 ; SKX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
   6330 ; SKX-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6331 ; SKX-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
   6332 ; SKX-NEXT:    retq # sched: [7:1.00]
   6333 ;
   6334 ; ZNVER1-LABEL: test_psrlw:
   6335 ; ZNVER1:       # %bb.0:
   6336 ; ZNVER1-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
   6337 ; ZNVER1-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
   6338 ; ZNVER1-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.25]
   6339 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test covering three forms of vpsrlw in one function:
        ;   1. @llvm.x86.avx2.psrl.w with the <8 x i16> argument %a1 as second
        ;      operand (expected as the %xmm1 register form),
        ;   2. the same intrinsic with an <8 x i16> loaded from %a2 (align 16)
        ;      as second operand (expected as the (%rdi) memory form),
        ;   3. a plain lshr of every lane by the constant 2 (expected as the
        ;      $2 immediate form).
   6340   %1 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
   6341   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
   6342   %3 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %1, <8 x i16> %2)
   6343   %4 = lshr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   6344   ret <16 x i16> %4
   6345 }
   6346 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
   6347 
   6348 define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   6349 ; GENERIC-LABEL: test_psubb:
   6350 ; GENERIC:       # %bb.0:
   6351 ; GENERIC-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6352 ; GENERIC-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6353 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6354 ;
   6355 ; HASWELL-LABEL: test_psubb:
   6356 ; HASWELL:       # %bb.0:
   6357 ; HASWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6358 ; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6359 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6360 ;
   6361 ; BROADWELL-LABEL: test_psubb:
   6362 ; BROADWELL:       # %bb.0:
   6363 ; BROADWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6364 ; BROADWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6365 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6366 ;
   6367 ; SKYLAKE-LABEL: test_psubb:
   6368 ; SKYLAKE:       # %bb.0:
   6369 ; SKYLAKE-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6370 ; SKYLAKE-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6371 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6372 ;
   6373 ; SKX-LABEL: test_psubb:
   6374 ; SKX:       # %bb.0:
   6375 ; SKX-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6376 ; SKX-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6377 ; SKX-NEXT:    retq # sched: [7:1.00]
   6378 ;
   6379 ; ZNVER1-LABEL: test_psubb:
   6380 ; ZNVER1:       # %bb.0:
   6381 ; ZNVER1-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6382 ; ZNVER1-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6383 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubb, written with the generic IR
        ; `sub` on <32 x i8> rather than an intrinsic. The first sub uses the
        ; two vector arguments; the second subtracts a vector loaded from %a2
        ; (align 32). The assertions above expect a register-operand vpsubb
        ; followed by a (%rdi) memory-operand vpsubb for every CPU model.
   6384   %1 = sub <32 x i8> %a0, %a1
   6385   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   6386   %3 = sub <32 x i8> %1, %2
   6387   ret <32 x i8> %3
   6388 }
   6389 
   6390 define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
   6391 ; GENERIC-LABEL: test_psubd:
   6392 ; GENERIC:       # %bb.0:
   6393 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6394 ; GENERIC-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6395 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6396 ;
   6397 ; HASWELL-LABEL: test_psubd:
   6398 ; HASWELL:       # %bb.0:
   6399 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6400 ; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6401 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6402 ;
   6403 ; BROADWELL-LABEL: test_psubd:
   6404 ; BROADWELL:       # %bb.0:
   6405 ; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6406 ; BROADWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6407 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6408 ;
   6409 ; SKYLAKE-LABEL: test_psubd:
   6410 ; SKYLAKE:       # %bb.0:
   6411 ; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6412 ; SKYLAKE-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6413 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6414 ;
   6415 ; SKX-LABEL: test_psubd:
   6416 ; SKX:       # %bb.0:
   6417 ; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6418 ; SKX-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6419 ; SKX-NEXT:    retq # sched: [7:1.00]
   6420 ;
   6421 ; ZNVER1-LABEL: test_psubd:
   6422 ; ZNVER1:       # %bb.0:
   6423 ; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6424 ; ZNVER1-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6425 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubd, written with the generic IR
        ; `sub` on <8 x i32> rather than an intrinsic. The first sub uses the
        ; two vector arguments; the second subtracts a vector loaded from %a2
        ; (align 32). The assertions above expect a register-operand vpsubd
        ; followed by a (%rdi) memory-operand vpsubd for every CPU model.
   6426   %1 = sub <8 x i32> %a0, %a1
   6427   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
   6428   %3 = sub <8 x i32> %1, %2
   6429   ret <8 x i32> %3
   6430 }
   6431 
   6432 define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
   6433 ; GENERIC-LABEL: test_psubq:
   6434 ; GENERIC:       # %bb.0:
   6435 ; GENERIC-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6436 ; GENERIC-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6437 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6438 ;
   6439 ; HASWELL-LABEL: test_psubq:
   6440 ; HASWELL:       # %bb.0:
   6441 ; HASWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6442 ; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6443 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6444 ;
   6445 ; BROADWELL-LABEL: test_psubq:
   6446 ; BROADWELL:       # %bb.0:
   6447 ; BROADWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6448 ; BROADWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6449 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6450 ;
   6451 ; SKYLAKE-LABEL: test_psubq:
   6452 ; SKYLAKE:       # %bb.0:
   6453 ; SKYLAKE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6454 ; SKYLAKE-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6455 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6456 ;
   6457 ; SKX-LABEL: test_psubq:
   6458 ; SKX:       # %bb.0:
   6459 ; SKX-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6460 ; SKX-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6461 ; SKX-NEXT:    retq # sched: [7:1.00]
   6462 ;
   6463 ; ZNVER1-LABEL: test_psubq:
   6464 ; ZNVER1:       # %bb.0:
   6465 ; ZNVER1-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6466 ; ZNVER1-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6467 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubq, written with the generic IR
        ; `sub` on <4 x i64> rather than an intrinsic. The first sub uses the
        ; two vector arguments; the second subtracts a vector loaded from %a2
        ; (align 32). The assertions above expect a register-operand vpsubq
        ; followed by a (%rdi) memory-operand vpsubq for every CPU model.
   6468   %1 = sub <4 x i64> %a0, %a1
   6469   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
   6470   %3 = sub <4 x i64> %1, %2
   6471   ret <4 x i64> %3
   6472 }
   6473 
   6474 define <32 x i8> @test_psubsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   6475 ; GENERIC-LABEL: test_psubsb:
   6476 ; GENERIC:       # %bb.0:
   6477 ; GENERIC-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6478 ; GENERIC-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6479 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6480 ;
   6481 ; HASWELL-LABEL: test_psubsb:
   6482 ; HASWELL:       # %bb.0:
   6483 ; HASWELL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6484 ; HASWELL-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6485 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6486 ;
   6487 ; BROADWELL-LABEL: test_psubsb:
   6488 ; BROADWELL:       # %bb.0:
   6489 ; BROADWELL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6490 ; BROADWELL-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6491 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6492 ;
   6493 ; SKYLAKE-LABEL: test_psubsb:
   6494 ; SKYLAKE:       # %bb.0:
   6495 ; SKYLAKE-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6496 ; SKYLAKE-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6497 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6498 ;
   6499 ; SKX-LABEL: test_psubsb:
   6500 ; SKX:       # %bb.0:
   6501 ; SKX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6502 ; SKX-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6503 ; SKX-NEXT:    retq # sched: [7:1.00]
   6504 ;
   6505 ; ZNVER1-LABEL: test_psubsb:
   6506 ; ZNVER1:       # %bb.0:
   6507 ; ZNVER1-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6508 ; ZNVER1-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6509 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubsb. The body chains two calls to
        ; @llvm.x86.avx2.psubs.b: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <32 x i8> loaded from %a2 (align 32). The assertions above expect a
        ; register-operand vpsubsb followed by a (%rdi) memory-operand
        ; vpsubsb for every CPU model.
   6510   %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
   6511   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   6512   %3 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %1, <32 x i8> %2)
   6513   ret <32 x i8> %3
   6514 }
   6515 declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
   6516 
   6517 define <16 x i16> @test_psubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   6518 ; GENERIC-LABEL: test_psubsw:
   6519 ; GENERIC:       # %bb.0:
   6520 ; GENERIC-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6521 ; GENERIC-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6522 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6523 ;
   6524 ; HASWELL-LABEL: test_psubsw:
   6525 ; HASWELL:       # %bb.0:
   6526 ; HASWELL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6527 ; HASWELL-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6528 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6529 ;
   6530 ; BROADWELL-LABEL: test_psubsw:
   6531 ; BROADWELL:       # %bb.0:
   6532 ; BROADWELL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6533 ; BROADWELL-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6534 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6535 ;
   6536 ; SKYLAKE-LABEL: test_psubsw:
   6537 ; SKYLAKE:       # %bb.0:
   6538 ; SKYLAKE-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6539 ; SKYLAKE-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6540 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6541 ;
   6542 ; SKX-LABEL: test_psubsw:
   6543 ; SKX:       # %bb.0:
   6544 ; SKX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6545 ; SKX-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6546 ; SKX-NEXT:    retq # sched: [7:1.00]
   6547 ;
   6548 ; ZNVER1-LABEL: test_psubsw:
   6549 ; ZNVER1:       # %bb.0:
   6550 ; ZNVER1-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6551 ; ZNVER1-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6552 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubsw. The body chains two calls to
        ; @llvm.x86.avx2.psubs.w: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <16 x i16> loaded from %a2 (align 32). The assertions above expect a
        ; register-operand vpsubsw followed by a (%rdi) memory-operand
        ; vpsubsw for every CPU model.
   6553   %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
   6554   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   6555   %3 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %1, <16 x i16> %2)
   6556   ret <16 x i16> %3
   6557 }
   6558 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
   6559 
   6560 define <32 x i8> @test_psubusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   6561 ; GENERIC-LABEL: test_psubusb:
   6562 ; GENERIC:       # %bb.0:
   6563 ; GENERIC-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6564 ; GENERIC-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6565 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6566 ;
   6567 ; HASWELL-LABEL: test_psubusb:
   6568 ; HASWELL:       # %bb.0:
   6569 ; HASWELL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6570 ; HASWELL-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6571 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6572 ;
   6573 ; BROADWELL-LABEL: test_psubusb:
   6574 ; BROADWELL:       # %bb.0:
   6575 ; BROADWELL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6576 ; BROADWELL-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6577 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6578 ;
   6579 ; SKYLAKE-LABEL: test_psubusb:
   6580 ; SKYLAKE:       # %bb.0:
   6581 ; SKYLAKE-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6582 ; SKYLAKE-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6583 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6584 ;
   6585 ; SKX-LABEL: test_psubusb:
   6586 ; SKX:       # %bb.0:
   6587 ; SKX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6588 ; SKX-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6589 ; SKX-NEXT:    retq # sched: [7:1.00]
   6590 ;
   6591 ; ZNVER1-LABEL: test_psubusb:
   6592 ; ZNVER1:       # %bb.0:
   6593 ; ZNVER1-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6594 ; ZNVER1-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6595 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubusb. The body chains two calls to
        ; @llvm.x86.avx2.psubus.b: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <32 x i8> loaded from %a2 (align 32). The assertions above expect a
        ; register-operand vpsubusb followed by a (%rdi) memory-operand
        ; vpsubusb for every CPU model.
   6596   %1 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
   6597   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   6598   %3 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %1, <32 x i8> %2)
   6599   ret <32 x i8> %3
   6600 }
   6601 declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
   6602 
   6603 define <16 x i16> @test_psubusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   6604 ; GENERIC-LABEL: test_psubusw:
   6605 ; GENERIC:       # %bb.0:
   6606 ; GENERIC-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6607 ; GENERIC-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6608 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6609 ;
   6610 ; HASWELL-LABEL: test_psubusw:
   6611 ; HASWELL:       # %bb.0:
   6612 ; HASWELL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6613 ; HASWELL-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6614 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6615 ;
   6616 ; BROADWELL-LABEL: test_psubusw:
   6617 ; BROADWELL:       # %bb.0:
   6618 ; BROADWELL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6619 ; BROADWELL-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6620 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6621 ;
   6622 ; SKYLAKE-LABEL: test_psubusw:
   6623 ; SKYLAKE:       # %bb.0:
   6624 ; SKYLAKE-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6625 ; SKYLAKE-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6626 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6627 ;
   6628 ; SKX-LABEL: test_psubusw:
   6629 ; SKX:       # %bb.0:
   6630 ; SKX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6631 ; SKX-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6632 ; SKX-NEXT:    retq # sched: [7:1.00]
   6633 ;
   6634 ; ZNVER1-LABEL: test_psubusw:
   6635 ; ZNVER1:       # %bb.0:
   6636 ; ZNVER1-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6637 ; ZNVER1-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6638 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubusw. The body chains two calls to
        ; @llvm.x86.avx2.psubus.w: the first takes both operands from the
        ; arguments %a0 and %a1, the second combines that result with a
        ; <16 x i16> loaded from %a2 (align 32). The assertions above expect a
        ; register-operand vpsubusw followed by a (%rdi) memory-operand
        ; vpsubusw for every CPU model.
   6639   %1 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
   6640   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   6641   %3 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %1, <16 x i16> %2)
   6642   ret <16 x i16> %3
   6643 }
   6644 declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
   6645 
   6646 define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
   6647 ; GENERIC-LABEL: test_psubw:
   6648 ; GENERIC:       # %bb.0:
   6649 ; GENERIC-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6650 ; GENERIC-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6651 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6652 ;
   6653 ; HASWELL-LABEL: test_psubw:
   6654 ; HASWELL:       # %bb.0:
   6655 ; HASWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6656 ; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6657 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6658 ;
   6659 ; BROADWELL-LABEL: test_psubw:
   6660 ; BROADWELL:       # %bb.0:
   6661 ; BROADWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6662 ; BROADWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   6663 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6664 ;
   6665 ; SKYLAKE-LABEL: test_psubw:
   6666 ; SKYLAKE:       # %bb.0:
   6667 ; SKYLAKE-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6668 ; SKYLAKE-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6669 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6670 ;
   6671 ; SKX-LABEL: test_psubw:
   6672 ; SKX:       # %bb.0:
   6673 ; SKX-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6674 ; SKX-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6675 ; SKX-NEXT:    retq # sched: [7:1.00]
   6676 ;
   6677 ; ZNVER1-LABEL: test_psubw:
   6678 ; ZNVER1:       # %bb.0:
   6679 ; ZNVER1-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6680 ; ZNVER1-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   6681 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpsubw, written with the generic IR
        ; `sub` on <16 x i16> rather than an intrinsic. The first sub uses the
        ; two vector arguments; the second subtracts a vector loaded from %a2
        ; (align 32). The assertions above expect a register-operand vpsubw
        ; followed by a (%rdi) memory-operand vpsubw for every CPU model.
   6682   %1 = sub <16 x i16> %a0, %a1
   6683   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
   6684   %3 = sub <16 x i16> %1, %2
   6685   ret <16 x i16> %3
   6686 }
   6687 
   6688 define <32 x i8> @test_punpckhbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
   6689 ; GENERIC-LABEL: test_punpckhbw:
   6690 ; GENERIC:       # %bb.0:
   6691 ; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
   6692 ; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
   6693 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6694 ;
   6695 ; HASWELL-LABEL: test_punpckhbw:
   6696 ; HASWELL:       # %bb.0:
   6697 ; HASWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
   6698 ; HASWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
   6699 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6700 ;
   6701 ; BROADWELL-LABEL: test_punpckhbw:
   6702 ; BROADWELL:       # %bb.0:
   6703 ; BROADWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
   6704 ; BROADWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [7:1.00]
   6705 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6706 ;
   6707 ; SKYLAKE-LABEL: test_punpckhbw:
   6708 ; SKYLAKE:       # %bb.0:
   6709 ; SKYLAKE-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
   6710 ; SKYLAKE-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
   6711 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6712 ;
   6713 ; SKX-LABEL: test_punpckhbw:
   6714 ; SKX:       # %bb.0:
   6715 ; SKX-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
   6716 ; SKX-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
   6717 ; SKX-NEXT:    retq # sched: [7:1.00]
   6718 ;
   6719 ; ZNVER1-LABEL: test_punpckhbw:
   6720 ; ZNVER1:       # %bb.0:
   6721 ; ZNVER1-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:0.25]
   6722 ; ZNVER1-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:0.50]
   6723 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
        ; Schedule test for the 256-bit vpunpckhbw, expressed as two
        ; shufflevector instructions that share one mask. The mask alternates
        ; indices 8..15 with 40..47 and then 24..31 with 56..63; the expected
        ; assembly above shows this as bytes 8-15 and 24-31 of the first
        ; operand interleaved with the same byte positions of the second.
        ; The first shuffle pairs the two vector arguments (register form);
        ; the second pairs that result with a <32 x i8> loaded from %a2
        ; (align 32), which the assertions expect as the `mem` operand form.
   6724   %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   6725   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
   6726   %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   6727   ret <32 x i8> %3
   6728 }
   6729 
   6730 define <8 x i32> @test_punpckhdq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpckhdq: one
        ; register-register unpack, then one unpack whose second source is the
        ; vector loaded from %a2. The assertion lines that follow are
        ; autogenerated (see the note on the first line of this file) and carry
        ; the "sched" suffix emitted because llc is invoked with -print-schedule;
        ; regenerate them with the script rather than editing them by hand.
   6731 ; GENERIC-LABEL: test_punpckhdq:
   6732 ; GENERIC:       # %bb.0:
   6733 ; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   6734 ; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   6735 ; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6736 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6737 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6738 ;
   6739 ; HASWELL-LABEL: test_punpckhdq:
   6740 ; HASWELL:       # %bb.0:
   6741 ; HASWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   6742 ; HASWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   6743 ; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6744 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6745 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6746 ;
   6747 ; BROADWELL-LABEL: test_punpckhdq:
   6748 ; BROADWELL:       # %bb.0:
   6749 ; BROADWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   6750 ; BROADWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
   6751 ; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6752 ; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6753 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6754 ;
   6755 ; SKYLAKE-LABEL: test_punpckhdq:
   6756 ; SKYLAKE:       # %bb.0:
   6757 ; SKYLAKE-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   6758 ; SKYLAKE-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   6759 ; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6760 ; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6761 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6762 ;
   6763 ; SKX-LABEL: test_punpckhdq:
   6764 ; SKX:       # %bb.0:
   6765 ; SKX-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
   6766 ; SKX-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
   6767 ; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6768 ; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6769 ; SKX-NEXT:    retq # sched: [7:1.00]
   6770 ;
   6771 ; ZNVER1-LABEL: test_punpckhdq:
   6772 ; ZNVER1:       # %bb.0:
   6773 ; ZNVER1-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.25]
   6774 ; ZNVER1-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50]
   6775 ; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
   6776 ; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6777 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-7 select from the first operand and 8-15 from the
          ; second, so this interleaves elements 2-3 and 6-7 of %a0 with the
          ; same elements of %a1.
   6778   %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   6779   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
          ; Same mask again, now pairing the previous result with the loaded
          ; vector; the assertions expect this as the memory-operand form.
   6780   %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
          ; The +1 appears in the expected output as vpcmpeqd (all-ones) followed
          ; by vpsubd. NOTE(review): presumably it exists to keep the shuffle in
          ; the integer domain so the integer unpack is selected - confirm.
   6781   %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   6782   ret <8 x i32> %4
   6783 }
   6784 
   6785 define <4 x i64> @test_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpckhqdq in both its
        ; register-register form and its memory-operand form (second source
        ; loaded from %a2). Unlike the chained unpack tests in this file, both
        ; shuffles here read %a0 directly and their results are summed.
        ; The assertion lines that follow are autogenerated (see the note on the
        ; first line of this file); regenerate them instead of hand-editing.
   6786 ; GENERIC-LABEL: test_punpckhqdq:
   6787 ; GENERIC:       # %bb.0:
   6788 ; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   6789 ; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   6790 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6791 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6792 ;
   6793 ; HASWELL-LABEL: test_punpckhqdq:
   6794 ; HASWELL:       # %bb.0:
   6795 ; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   6796 ; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   6797 ; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6798 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6799 ;
   6800 ; BROADWELL-LABEL: test_punpckhqdq:
   6801 ; BROADWELL:       # %bb.0:
   6802 ; BROADWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   6803 ; BROADWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
   6804 ; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6805 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6806 ;
   6807 ; SKYLAKE-LABEL: test_punpckhqdq:
   6808 ; SKYLAKE:       # %bb.0:
   6809 ; SKYLAKE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   6810 ; SKYLAKE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   6811 ; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   6812 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6813 ;
   6814 ; SKX-LABEL: test_punpckhqdq:
   6815 ; SKX:       # %bb.0:
   6816 ; SKX-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
   6817 ; SKX-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
   6818 ; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   6819 ; SKX-NEXT:    retq # sched: [7:1.00]
   6820 ;
   6821 ; ZNVER1-LABEL: test_punpckhqdq:
   6822 ; ZNVER1:       # %bb.0:
   6823 ; ZNVER1-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.25]
   6824 ; ZNVER1-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:0.50]
   6825 ; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
   6826 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-3 select from the first operand and 4-7 from the
          ; second: elements 1 and 3 of %a0 interleaved with elements 1 and 3
          ; of %a1.
   6827   %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   6828   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
          ; Same mask, but with %a0 (not %1) as the first operand and the loaded
          ; vector as the second.
   6829   %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
          ; Summing uses both shuffle results; it is expected as a single vpaddq.
          ; NOTE(review): presumably the integer add also keeps the shuffles in
          ; the integer domain - confirm.
   6830   %4 = add <4 x i64> %1, %3
   6831   ret <4 x i64> %4
   6832 }
   6833 
   6834 define <16 x i16> @test_punpckhwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpckhwd: one
        ; register-register unpack, then one unpack whose second source is the
        ; vector loaded from %a2. The assertion lines that follow are
        ; autogenerated (see the note on the first line of this file);
        ; regenerate them with the script rather than editing them by hand.
   6835 ; GENERIC-LABEL: test_punpckhwd:
   6836 ; GENERIC:       # %bb.0:
   6837 ; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
   6838 ; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
   6839 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6840 ;
   6841 ; HASWELL-LABEL: test_punpckhwd:
   6842 ; HASWELL:       # %bb.0:
   6843 ; HASWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
   6844 ; HASWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
   6845 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6846 ;
   6847 ; BROADWELL-LABEL: test_punpckhwd:
   6848 ; BROADWELL:       # %bb.0:
   6849 ; BROADWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
   6850 ; BROADWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [7:1.00]
   6851 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6852 ;
   6853 ; SKYLAKE-LABEL: test_punpckhwd:
   6854 ; SKYLAKE:       # %bb.0:
   6855 ; SKYLAKE-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
   6856 ; SKYLAKE-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
   6857 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6858 ;
   6859 ; SKX-LABEL: test_punpckhwd:
   6860 ; SKX:       # %bb.0:
   6861 ; SKX-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
   6862 ; SKX-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
   6863 ; SKX-NEXT:    retq # sched: [7:1.00]
   6864 ;
   6865 ; ZNVER1-LABEL: test_punpckhwd:
   6866 ; ZNVER1:       # %bb.0:
   6867 ; ZNVER1-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:0.25]
   6868 ; ZNVER1-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:0.50]
   6869 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-15 select from the first operand and 16-31 from the
          ; second, so this interleaves elements 4-7 and 12-15 of %a0 with the
          ; same elements of %a1.
   6870   %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   6871   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
          ; Same mask again, pairing the previous result with the loaded vector;
          ; the assertions expect this as the memory-operand form.
   6872   %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   6873   ret <16 x i16> %3
   6874 }
   6875 
   6876 define <32 x i8> @test_punpcklbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpcklbw: one
        ; register-register unpack, then one unpack whose second source is the
        ; vector loaded from %a2. The assertion lines that follow are
        ; autogenerated (see the note on the first line of this file);
        ; regenerate them with the script rather than editing them by hand.
   6877 ; GENERIC-LABEL: test_punpcklbw:
   6878 ; GENERIC:       # %bb.0:
   6879 ; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
   6880 ; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
   6881 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6882 ;
   6883 ; HASWELL-LABEL: test_punpcklbw:
   6884 ; HASWELL:       # %bb.0:
   6885 ; HASWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
   6886 ; HASWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
   6887 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6888 ;
   6889 ; BROADWELL-LABEL: test_punpcklbw:
   6890 ; BROADWELL:       # %bb.0:
   6891 ; BROADWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
   6892 ; BROADWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [7:1.00]
   6893 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6894 ;
   6895 ; SKYLAKE-LABEL: test_punpcklbw:
   6896 ; SKYLAKE:       # %bb.0:
   6897 ; SKYLAKE-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
   6898 ; SKYLAKE-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
   6899 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6900 ;
   6901 ; SKX-LABEL: test_punpcklbw:
   6902 ; SKX:       # %bb.0:
   6903 ; SKX-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
   6904 ; SKX-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
   6905 ; SKX-NEXT:    retq # sched: [7:1.00]
   6906 ;
   6907 ; ZNVER1-LABEL: test_punpcklbw:
   6908 ; ZNVER1:       # %bb.0:
   6909 ; ZNVER1-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:0.25]
   6910 ; ZNVER1-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:0.50]
   6911 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-31 select from the first operand and 32-63 from the
          ; second, so this interleaves elements 0-7 and 16-23 of %a0 with the
          ; same elements of %a1.
   6912   %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   6913   %2 = load <32 x i8>, <32 x i8> *%a2, align 32
          ; Same mask again, pairing the previous result with the loaded vector;
          ; the assertions expect this as the memory-operand form.
   6914   %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   6915   ret <32 x i8> %3
   6916 }
   6917 
   6918 define <8 x i32> @test_punpckldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpckldq: one
        ; register-register unpack, then one unpack whose second source is the
        ; vector loaded from %a2. The assertion lines that follow are
        ; autogenerated (see the note on the first line of this file);
        ; regenerate them with the script rather than editing them by hand.
   6919 ; GENERIC-LABEL: test_punpckldq:
   6920 ; GENERIC:       # %bb.0:
   6921 ; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   6922 ; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   6923 ; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6924 ; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6925 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6926 ;
   6927 ; HASWELL-LABEL: test_punpckldq:
   6928 ; HASWELL:       # %bb.0:
   6929 ; HASWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   6930 ; HASWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   6931 ; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6932 ; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6933 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6934 ;
   6935 ; BROADWELL-LABEL: test_punpckldq:
   6936 ; BROADWELL:       # %bb.0:
   6937 ; BROADWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   6938 ; BROADWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
   6939 ; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6940 ; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   6941 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6942 ;
   6943 ; SKYLAKE-LABEL: test_punpckldq:
   6944 ; SKYLAKE:       # %bb.0:
   6945 ; SKYLAKE-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   6946 ; SKYLAKE-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   6947 ; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6948 ; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6949 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   6950 ;
   6951 ; SKX-LABEL: test_punpckldq:
   6952 ; SKX:       # %bb.0:
   6953 ; SKX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
   6954 ; SKX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
   6955 ; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
   6956 ; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   6957 ; SKX-NEXT:    retq # sched: [7:1.00]
   6958 ;
   6959 ; ZNVER1-LABEL: test_punpckldq:
   6960 ; ZNVER1:       # %bb.0:
   6961 ; ZNVER1-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.25]
   6962 ; ZNVER1-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50]
   6963 ; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
   6964 ; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   6965 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-7 select from the first operand and 8-15 from the
          ; second, so this interleaves elements 0-1 and 4-5 of %a0 with the
          ; same elements of %a1.
   6966   %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   6967   %2 = load <8 x i32>, <8 x i32> *%a2, align 32
          ; Same mask again, pairing the previous result with the loaded vector;
          ; the assertions expect this as the memory-operand form.
   6968   %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
          ; The +1 appears in the expected output as vpcmpeqd (all-ones) followed
          ; by vpsubd. NOTE(review): presumably it exists to keep the shuffle in
          ; the integer domain so the integer unpack is selected - confirm.
   6969   %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   6970   ret <8 x i32> %4
   6971 }
   6972 
   6973 define <4 x i64> @test_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpcklqdq in both its
        ; register-register form and its memory-operand form (second source
        ; loaded from %a2). Unlike the chained unpack tests in this file, both
        ; shuffles here read %a0 directly and their results are summed.
        ; The assertion lines that follow are autogenerated (see the note on the
        ; first line of this file); regenerate them instead of hand-editing.
   6974 ; GENERIC-LABEL: test_punpcklqdq:
   6975 ; GENERIC:       # %bb.0:
   6976 ; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   6977 ; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   6978 ; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6979 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   6980 ;
   6981 ; HASWELL-LABEL: test_punpcklqdq:
   6982 ; HASWELL:       # %bb.0:
   6983 ; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   6984 ; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   6985 ; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6986 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   6987 ;
   6988 ; BROADWELL-LABEL: test_punpcklqdq:
   6989 ; BROADWELL:       # %bb.0:
   6990 ; BROADWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   6991 ; BROADWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
   6992 ; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
   6993 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   6994 ;
   6995 ; SKYLAKE-LABEL: test_punpcklqdq:
   6996 ; SKYLAKE:       # %bb.0:
   6997 ; SKYLAKE-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   6998 ; SKYLAKE-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   6999 ; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   7000 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   7001 ;
   7002 ; SKX-LABEL: test_punpcklqdq:
   7003 ; SKX:       # %bb.0:
   7004 ; SKX-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
   7005 ; SKX-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
   7006 ; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
   7007 ; SKX-NEXT:    retq # sched: [7:1.00]
   7008 ;
   7009 ; ZNVER1-LABEL: test_punpcklqdq:
   7010 ; ZNVER1:       # %bb.0:
   7011 ; ZNVER1-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.25]
   7012 ; ZNVER1-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:0.50]
   7013 ; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
   7014 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-3 select from the first operand and 4-7 from the
          ; second: elements 0 and 2 of %a0 interleaved with elements 0 and 2
          ; of %a1.
   7015   %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   7016   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
          ; Same mask, but with %a0 (not %1) as the first operand and the loaded
          ; vector as the second.
   7017   %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
          ; Summing uses both shuffle results; it is expected as a single vpaddq.
          ; NOTE(review): presumably the integer add also keeps the shuffles in
          ; the integer domain - confirm.
   7018   %4 = add <4 x i64> %1, %3
   7019   ret <4 x i64> %4
   7020 }
   7021 
   7022 define <16 x i16> @test_punpcklwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpunpcklwd: one
        ; register-register unpack, then one unpack whose second source is the
        ; vector loaded from %a2. The assertion lines that follow are
        ; autogenerated (see the note on the first line of this file);
        ; regenerate them with the script rather than editing them by hand.
   7023 ; GENERIC-LABEL: test_punpcklwd:
   7024 ; GENERIC:       # %bb.0:
   7025 ; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
   7026 ; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
   7027 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7028 ;
   7029 ; HASWELL-LABEL: test_punpcklwd:
   7030 ; HASWELL:       # %bb.0:
   7031 ; HASWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
   7032 ; HASWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
   7033 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   7034 ;
   7035 ; BROADWELL-LABEL: test_punpcklwd:
   7036 ; BROADWELL:       # %bb.0:
   7037 ; BROADWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
   7038 ; BROADWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [7:1.00]
   7039 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   7040 ;
   7041 ; SKYLAKE-LABEL: test_punpcklwd:
   7042 ; SKYLAKE:       # %bb.0:
   7043 ; SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
   7044 ; SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
   7045 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   7046 ;
   7047 ; SKX-LABEL: test_punpcklwd:
   7048 ; SKX:       # %bb.0:
   7049 ; SKX-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
   7050 ; SKX-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
   7051 ; SKX-NEXT:    retq # sched: [7:1.00]
   7052 ;
   7053 ; ZNVER1-LABEL: test_punpcklwd:
   7054 ; ZNVER1:       # %bb.0:
   7055 ; ZNVER1-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:0.25]
   7056 ; ZNVER1-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:0.50]
   7057 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Mask indices 0-15 select from the first operand and 16-31 from the
          ; second, so this interleaves elements 0-3 and 8-11 of %a0 with the
          ; same elements of %a1.
   7058   %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   7059   %2 = load <16 x i16>, <16 x i16> *%a2, align 32
          ; Same mask again, pairing the previous result with the loaded vector;
          ; the assertions expect this as the memory-operand form.
   7060   %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   7061   ret <16 x i16> %3
   7062 }
   7063 
   7064 define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
        ; Scheduling-annotation test for the 256-bit vpxor: one register-register
        ; xor, then one xor whose second source is read from the pointer argument
        ; (shown as (%rdi) in the expected output), then an integer add of %a1.
        ; The assertion lines that follow are autogenerated (see the note on the
        ; first line of this file); regenerate them instead of hand-editing.
   7065 ; GENERIC-LABEL: test_pxor:
   7066 ; GENERIC:       # %bb.0:
   7067 ; GENERIC-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7068 ; GENERIC-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   7069 ; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   7070 ; GENERIC-NEXT:    retq # sched: [1:1.00]
   7071 ;
   7072 ; HASWELL-LABEL: test_pxor:
   7073 ; HASWELL:       # %bb.0:
   7074 ; HASWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7075 ; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   7076 ; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   7077 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   7078 ;
   7079 ; BROADWELL-LABEL: test_pxor:
   7080 ; BROADWELL:       # %bb.0:
   7081 ; BROADWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7082 ; BROADWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
   7083 ; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
   7084 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
   7085 ;
   7086 ; SKYLAKE-LABEL: test_pxor:
   7087 ; SKYLAKE:       # %bb.0:
   7088 ; SKYLAKE-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7089 ; SKYLAKE-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   7090 ; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7091 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
   7092 ;
   7093 ; SKX-LABEL: test_pxor:
   7094 ; SKX:       # %bb.0:
   7095 ; SKX-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7096 ; SKX-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   7097 ; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
   7098 ; SKX-NEXT:    retq # sched: [7:1.00]
   7099 ;
   7100 ; ZNVER1-LABEL: test_pxor:
   7101 ; ZNVER1:       # %bb.0:
   7102 ; ZNVER1-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   7103 ; ZNVER1-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
   7104 ; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
   7105 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
          ; Register-register xor of the two vector arguments.
   7106   %1 = xor <4 x i64> %a0, %a1
   7107   %2 = load <4 x i64>, <4 x i64> *%a2, align 32
          ; Xor with the loaded vector; the assertions expect the load folded
          ; into the instruction as a memory operand.
   7108   %3 = xor <4 x i64> %1, %2
          ; Expected as a single vpaddq. NOTE(review): presumably this integer
          ; add exists to keep the xors in the integer domain so vpxor is
          ; selected rather than a floating-point-domain xor - confirm.
   7109   %4 = add <4 x i64> %3, %a1
   7110   ret <4 x i64> %4
   7111 }
   7112 
   7113 !0 = !{i32 1} ; metadata node holding the single constant i32 1; not referenced by any function in this part of the file (NOTE(review): presumably used by an earlier test - confirm)
   7114