Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512F
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
      7 
      ; Register-register fadd of two <8 x double> values. All RUN configurations
      ; share one expectation, a single 512-bit vaddpd.
      8 define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
      9 ; CHECK-LABEL: addpd512:
     10 ; CHECK:       # %bb.0: # %entry
     11 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
     12 ; CHECK-NEXT:    retq
     13 entry:
     14   %add.i = fadd <8 x double> %x, %y
     15   ret <8 x double> %add.i
     16 }
     17 
     ; fadd of an <8 x double> argument with a constant vector whose elements are
     ; not all equal. The constant is expected to be folded into vaddpd as a
     ; RIP-relative memory operand (no separate load).
     18 define <8 x double> @addpd512fold(<8 x double> %y) {
     19 ; CHECK-LABEL: addpd512fold:
     20 ; CHECK:       # %bb.0: # %entry
     21 ; CHECK-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
     22 ; CHECK-NEXT:    retq
     23 entry:
     24   %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
     25   ret <8 x double> %add.i
     26 }
     27 
     ; Register-register fadd of two <16 x float> values; expected to become a
     ; single 512-bit vaddps in every RUN configuration.
     28 define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
     29 ; CHECK-LABEL: addps512:
     30 ; CHECK:       # %bb.0: # %entry
     31 ; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
     32 ; CHECK-NEXT:    retq
     33 entry:
     34   %add.i = fadd <16 x float> %x, %y
     35   ret <16 x float> %add.i
     36 }
     37 
     ; fadd of a <16 x float> argument with a constant vector whose elements are
     ; not all equal. The constant is expected to be folded into vaddps as a
     ; RIP-relative memory operand.
     38 define <16 x float> @addps512fold(<16 x float> %y) {
     39 ; CHECK-LABEL: addps512fold:
     40 ; CHECK:       # %bb.0: # %entry
     41 ; CHECK-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
     42 ; CHECK-NEXT:    retq
     43 entry:
     44   %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000,  float 0x4002666660000000, float 0x3FF3333340000000>
     45   ret <16 x float> %add.i
     46 }
     47 
     ; Register-register fsub (%x - %y) of two <8 x double> values; expected to
     ; become a single 512-bit vsubpd in every RUN configuration.
     48 define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
     49 ; CHECK-LABEL: subpd512:
     50 ; CHECK:       # %bb.0: # %entry
     51 ; CHECK-NEXT:    vsubpd %zmm0, %zmm1, %zmm0
     52 ; CHECK-NEXT:    retq
     53 entry:
     54   %sub.i = fsub <8 x double> %x, %y
     55   ret <8 x double> %sub.i
     56 }
     57 
     ; fsub whose subtrahend is loaded from %x (align 8). The load is expected to
     ; be folded into vsubpd as the (%rdi) memory operand.
     58 define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
     59 ; CHECK-LABEL: subpd512fold:
     60 ; CHECK:       # %bb.0: # %entry
     61 ; CHECK-NEXT:    vsubpd (%rdi), %zmm0, %zmm0
     62 ; CHECK-NEXT:    retq
     63 entry:
     64   %tmp2 = load <8 x double>, <8 x double>* %x, align 8
     65   %sub.i = fsub <8 x double> %y, %tmp2
     66   ret <8 x double> %sub.i
     67 }
     68 
     ; Register-register fsub (%x - %y) of two <16 x float> values; expected to
     ; become a single 512-bit vsubps in every RUN configuration.
     69 define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
     70 ; CHECK-LABEL: subps512:
     71 ; CHECK:       # %bb.0: # %entry
     72 ; CHECK-NEXT:    vsubps %zmm0, %zmm1, %zmm0
     73 ; CHECK-NEXT:    retq
     74 entry:
     75   %sub.i = fsub <16 x float> %x, %y
     76   ret <16 x float> %sub.i
     77 }
     78 
     ; fsub whose subtrahend is loaded from %x (align 4). The load is expected to
     ; be folded into vsubps as the (%rdi) memory operand.
     79 define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
     80 ; CHECK-LABEL: subps512fold:
     81 ; CHECK:       # %bb.0: # %entry
     82 ; CHECK-NEXT:    vsubps (%rdi), %zmm0, %zmm0
     83 ; CHECK-NEXT:    retq
     84 entry:
     85   %tmp2 = load <16 x float>, <16 x float>* %x, align 4
     86   %sub.i = fsub <16 x float> %y, %tmp2
     87   ret <16 x float> %sub.i
     88 }
     89 
     ; 64-bit element multiply of two <8 x i64> values.
     ; Runs without the DQ feature (AVX512F, AVX512VL, AVX512BW) expect the
     ; multiply to be expanded into a vpsrlq / vpmuludq / vpaddq / vpsllq sequence
     ; on zmm registers. Runs with DQ (AVX512DQ, SKX) expect a single vpmullq.
     90 define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
     91 ; AVX512F-LABEL: imulq512:
     92 ; AVX512F:       # %bb.0:
     93 ; AVX512F-NEXT:    vpsrlq $32, %zmm1, %zmm2
     94 ; AVX512F-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
     95 ; AVX512F-NEXT:    vpsrlq $32, %zmm0, %zmm3
     96 ; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
     97 ; AVX512F-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     98 ; AVX512F-NEXT:    vpsllq $32, %zmm2, %zmm2
     99 ; AVX512F-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
    100 ; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    101 ; AVX512F-NEXT:    retq
    102 ;
    103 ; AVX512VL-LABEL: imulq512:
    104 ; AVX512VL:       # %bb.0:
    105 ; AVX512VL-NEXT:    vpsrlq $32, %zmm1, %zmm2
    106 ; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
    107 ; AVX512VL-NEXT:    vpsrlq $32, %zmm0, %zmm3
    108 ; AVX512VL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
    109 ; AVX512VL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    110 ; AVX512VL-NEXT:    vpsllq $32, %zmm2, %zmm2
    111 ; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
    112 ; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    113 ; AVX512VL-NEXT:    retq
    114 ;
    115 ; AVX512BW-LABEL: imulq512:
    116 ; AVX512BW:       # %bb.0:
    117 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm2
    118 ; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
    119 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm3
    120 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
    121 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    122 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
    123 ; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
    124 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    125 ; AVX512BW-NEXT:    retq
    126 ;
    127 ; AVX512DQ-LABEL: imulq512:
    128 ; AVX512DQ:       # %bb.0:
    129 ; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
    130 ; AVX512DQ-NEXT:    retq
    131 ;
    132 ; SKX-LABEL: imulq512:
    133 ; SKX:       # %bb.0:
    134 ; SKX-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
    135 ; SKX-NEXT:    retq
    136   %z = mul <8 x i64>%x, %y
    137   ret <8 x i64>%z
    138 }
    139 
    ; 64-bit element multiply of two <4 x i64> values.
    ; AVX512F, AVX512VL and AVX512BW runs expect the vpsrlq / vpmuludq / vpaddq /
    ; vpsllq expansion on ymm registers. The AVX512DQ run (DQ without VL) expects
    ; the operands to be treated as zmm registers (see the "kill" annotations) so
    ; that the 512-bit vpmullq can be used. The SKX run expects a ymm vpmullq.
    140 define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
    141 ; AVX512F-LABEL: imulq256:
    142 ; AVX512F:       # %bb.0:
    143 ; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
    144 ; AVX512F-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
    145 ; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm3
    146 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
    147 ; AVX512F-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    148 ; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
    149 ; AVX512F-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
    150 ; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    151 ; AVX512F-NEXT:    retq
    152 ;
    153 ; AVX512VL-LABEL: imulq256:
    154 ; AVX512VL:       # %bb.0:
    155 ; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm2
    156 ; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
    157 ; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm3
    158 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
    159 ; AVX512VL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    160 ; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
    161 ; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
    162 ; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    163 ; AVX512VL-NEXT:    retq
    164 ;
    165 ; AVX512BW-LABEL: imulq256:
    166 ; AVX512BW:       # %bb.0:
    167 ; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm2
    168 ; AVX512BW-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
    169 ; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm3
    170 ; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
    171 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    172 ; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
    173 ; AVX512BW-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
    174 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    175 ; AVX512BW-NEXT:    retq
    176 ;
    177 ; AVX512DQ-LABEL: imulq256:
    178 ; AVX512DQ:       # %bb.0:
    179 ; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
    180 ; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    181 ; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
    182 ; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    183 ; AVX512DQ-NEXT:    retq
    184 ;
    185 ; SKX-LABEL: imulq256:
    186 ; SKX:       # %bb.0:
    187 ; SKX-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
    188 ; SKX-NEXT:    retq
    189   %z = mul <4 x i64>%x, %y
    190   ret <4 x i64>%z
    191 }
    192 
    ; 64-bit element multiply of two <2 x i64> values.
    ; AVX512F, AVX512VL and AVX512BW runs expect the vpsrlq / vpmuludq / vpaddq /
    ; vpsllq expansion on xmm registers. The AVX512DQ run (DQ without VL) expects
    ; the operands to be treated as zmm registers for a 512-bit vpmullq, followed
    ; by vzeroupper before returning. The SKX run expects an xmm vpmullq.
    193 define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
    194 ; AVX512F-LABEL: imulq128:
    195 ; AVX512F:       # %bb.0:
    196 ; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
    197 ; AVX512F-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
    198 ; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm3
    199 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
    200 ; AVX512F-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    201 ; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
    202 ; AVX512F-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
    203 ; AVX512F-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    204 ; AVX512F-NEXT:    retq
    205 ;
    206 ; AVX512VL-LABEL: imulq128:
    207 ; AVX512VL:       # %bb.0:
    208 ; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm2
    209 ; AVX512VL-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
    210 ; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm3
    211 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
    212 ; AVX512VL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    213 ; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
    214 ; AVX512VL-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
    215 ; AVX512VL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    216 ; AVX512VL-NEXT:    retq
    217 ;
    218 ; AVX512BW-LABEL: imulq128:
    219 ; AVX512BW:       # %bb.0:
    220 ; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm2
    221 ; AVX512BW-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
    222 ; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm3
    223 ; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
    224 ; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    225 ; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
    226 ; AVX512BW-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
    227 ; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    228 ; AVX512BW-NEXT:    retq
    229 ;
    230 ; AVX512DQ-LABEL: imulq128:
    231 ; AVX512DQ:       # %bb.0:
    232 ; AVX512DQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
    233 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
    234 ; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
    235 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    236 ; AVX512DQ-NEXT:    vzeroupper
    237 ; AVX512DQ-NEXT:    retq
    238 ;
    239 ; SKX-LABEL: imulq128:
    240 ; SKX:       # %bb.0:
    241 ; SKX-NEXT:    vpmullq %xmm0, %xmm1, %xmm0
    242 ; SKX-NEXT:    retq
    243   %z = mul <2 x i64>%x, %y
    244   ret <2 x i64>%z
    245 }
    246 
    ; Register-register fmul of two <8 x double> values; expected to become a
    ; single 512-bit vmulpd in every RUN configuration.
    247 define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
    248 ; CHECK-LABEL: mulpd512:
    249 ; CHECK:       # %bb.0: # %entry
    250 ; CHECK-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
    251 ; CHECK-NEXT:    retq
    252 entry:
    253   %mul.i = fmul <8 x double> %x, %y
    254   ret <8 x double> %mul.i
    255 }
    256 
    ; fmul of an <8 x double> argument with a constant vector whose elements are
    ; not all equal. The constant is expected to be folded into vmulpd as a
    ; RIP-relative memory operand.
    257 define <8 x double> @mulpd512fold(<8 x double> %y) {
    258 ; CHECK-LABEL: mulpd512fold:
    259 ; CHECK:       # %bb.0: # %entry
    260 ; CHECK-NEXT:    vmulpd {{.*}}(%rip), %zmm0, %zmm0
    261 ; CHECK-NEXT:    retq
    262 entry:
    263   %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
    264   ret <8 x double> %mul.i
    265 }
    266 
    ; Register-register fmul of two <16 x float> values; expected to become a
    ; single 512-bit vmulps in every RUN configuration.
    267 define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
    268 ; CHECK-LABEL: mulps512:
    269 ; CHECK:       # %bb.0: # %entry
    270 ; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
    271 ; CHECK-NEXT:    retq
    272 entry:
    273   %mul.i = fmul <16 x float> %x, %y
    274   ret <16 x float> %mul.i
    275 }
    276 
    ; fmul of a <16 x float> argument with a constant vector whose elements are
    ; not all equal. The constant is expected to be folded into vmulps as a
    ; RIP-relative memory operand.
    277 define <16 x float> @mulps512fold(<16 x float> %y) {
    278 ; CHECK-LABEL: mulps512fold:
    279 ; CHECK:       # %bb.0: # %entry
    280 ; CHECK-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
    281 ; CHECK-NEXT:    retq
    282 entry:
    283   %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
    284   ret <16 x float> %mul.i
    285 }
    286 
    ; Register-register fdiv (%x / %y) of two <8 x double> values; expected to
    ; become a single 512-bit vdivpd in every RUN configuration.
    287 define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
    288 ; CHECK-LABEL: divpd512:
    289 ; CHECK:       # %bb.0: # %entry
    290 ; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
    291 ; CHECK-NEXT:    retq
    292 entry:
    293   %div.i = fdiv <8 x double> %x, %y
    294   ret <8 x double> %div.i
    295 }
    296 
    ; fdiv of an <8 x double> argument by a constant vector whose elements are
    ; not all equal. The constant divisor is expected to be folded into vdivpd as
    ; a RIP-relative memory operand.
    297 define <8 x double> @divpd512fold(<8 x double> %y) {
    298 ; CHECK-LABEL: divpd512fold:
    299 ; CHECK:       # %bb.0: # %entry
    300 ; CHECK-NEXT:    vdivpd {{.*}}(%rip), %zmm0, %zmm0
    301 ; CHECK-NEXT:    retq
    302 entry:
    303   %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
    304   ret <8 x double> %div.i
    305 }
    306 
    ; Register-register fdiv (%x / %y) of two <16 x float> values; expected to
    ; become a single 512-bit vdivps in every RUN configuration.
    307 define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
    308 ; CHECK-LABEL: divps512:
    309 ; CHECK:       # %bb.0: # %entry
    310 ; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm0
    311 ; CHECK-NEXT:    retq
    312 entry:
    313   %div.i = fdiv <16 x float> %x, %y
    314   ret <16 x float> %div.i
    315 }
    316 
    ; fdiv of a <16 x float> argument by a constant vector whose elements are not
    ; all equal. The constant divisor is expected to be folded into vdivps as a
    ; RIP-relative memory operand.
    317 define <16 x float> @divps512fold(<16 x float> %y) {
    318 ; CHECK-LABEL: divps512fold:
    319 ; CHECK:       # %bb.0: # %entry
    320 ; CHECK-NEXT:    vdivps {{.*}}(%rip), %zmm0, %zmm0
    321 ; CHECK-NEXT:    retq
    322 entry:
    323   %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
    324   ret <16 x float> %div.i
    325 }
    326 
    ; Register-register integer add of two <8 x i64> values; expected to become a
    ; single 512-bit vpaddq in every RUN configuration.
    327 define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
    328 ; CHECK-LABEL: vpaddq_test:
    329 ; CHECK:       # %bb.0:
    330 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    331 ; CHECK-NEXT:    retq
    332   %x = add <8 x i64> %i, %j
    333   ret <8 x i64> %x
    334 }
    335 
    ; Integer add of an <8 x i64> argument with a vector loaded from %j (align 4).
    ; The load is expected to be folded into vpaddq as the (%rdi) memory operand.
    336 define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
    337 ; CHECK-LABEL: vpaddq_fold_test:
    338 ; CHECK:       # %bb.0:
    339 ; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
    340 ; CHECK-NEXT:    retq
    341   %tmp = load <8 x i64>, <8 x i64>* %j, align 4
    342   %x = add <8 x i64> %i, %tmp
    343   ret <8 x i64> %x
    344 }
    345 
    ; Integer add of an <8 x i64> argument with a splat constant (every element is
    ; i64 2). Expected to use vpaddq with a {1to8} embedded-broadcast memory
    ; operand addressed RIP-relatively.
    346 define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
    347 ; CHECK-LABEL: vpaddq_broadcast_test:
    348 ; CHECK:       # %bb.0:
    349 ; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    350 ; CHECK-NEXT:    retq
    351   %x = add <8 x i64> %i, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
    352   ret <8 x i64> %x
    353 }
    354 
    ; Integer add of an <8 x i64> argument with a vector built by inserting the
    ; same loaded i64 scalar into all eight lanes. The scalar load plus splat is
    ; expected to be folded into vpaddq as a (%rdi){1to8} broadcast operand.
    355 define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
    356 ; CHECK-LABEL: vpaddq_broadcast2_test:
    357 ; CHECK:       # %bb.0:
    358 ; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
    359 ; CHECK-NEXT:    retq
    360   %tmp = load i64, i64* %j
    361   %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
    362   %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
    363   %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
    364   %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
    365   %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
    366   %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
    367   %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
    368   %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
    369   %x = add <8 x i64> %i, %j.7
    370   ret <8 x i64> %x
    371 }
    372 
    ; Register-register integer add of two <16 x i32> values; expected to become a
    ; single 512-bit vpaddd in every RUN configuration.
    373 define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
    374 ; CHECK-LABEL: vpaddd_test:
    375 ; CHECK:       # %bb.0:
    376 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    377 ; CHECK-NEXT:    retq
    378   %x = add <16 x i32> %i, %j
    379   ret <16 x i32> %x
    380 }
    381 
    ; Integer add of a <16 x i32> argument with a vector loaded from %j (align 4).
    ; The load is expected to be folded into vpaddd as the (%rdi) memory operand.
    382 define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
    383 ; CHECK-LABEL: vpaddd_fold_test:
    384 ; CHECK:       # %bb.0:
    385 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
    386 ; CHECK-NEXT:    retq
    387   %tmp = load <16 x i32>, <16 x i32>* %j, align 4
    388   %x = add <16 x i32> %i, %tmp
    389   ret <16 x i32> %x
    390 }
    391 
    ; Integer add of a <16 x i32> argument with a splat constant (every element is
    ; i32 3). Expected to use vpaddd with a {1to16} embedded-broadcast memory
    ; operand addressed RIP-relatively.
    392 define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
    393 ; CHECK-LABEL: vpaddd_broadcast_test:
    394 ; CHECK:       # %bb.0:
    395 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
    396 ; CHECK-NEXT:    retq
    397   %x = add <16 x i32> %i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
    398   ret <16 x i32> %x
    399 }
    400 
    ; Merge-masked add: lanes where %mask1 is non-zero receive %i + %j, the other
    ; lanes keep %i. Expected to materialize the mask in k1 with vptestmd and emit
    ; vpaddd with {%k1} merge masking into zmm0.
    401 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
    402 ; CHECK-LABEL: vpaddd_mask_test:
    403 ; CHECK:       # %bb.0:
    404 ; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
    405 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
    406 ; CHECK-NEXT:    retq
    407   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    408   %x = add <16 x i32> %i, %j
    409   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
    410   ret <16 x i32> %r
    411 }
    412 
    ; Zero-masked add: lanes where %mask1 is non-zero receive %i + %j, the other
    ; lanes are zero. Expected to materialize the mask in k1 with vptestmd and
    ; emit vpaddd with {%k1} {z} zeroing masking.
    413 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
    414 ; CHECK-LABEL: vpaddd_maskz_test:
    415 ; CHECK:       # %bb.0:
    416 ; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
    417 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
    418 ; CHECK-NEXT:    retq
    419   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    420   %x = add <16 x i32> %i, %j
    421   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
    422   ret <16 x i32> %r
    423 }
    424 
    ; Merge-masked add whose second operand is loaded from %j.ptr: lanes where
    ; %mask1 is non-zero receive %i + load(%j.ptr), the other lanes keep %i.
    ; Expected to fold the load into vpaddd as (%rdi) under {%k1} merge masking.
    ; The function dereferences %j.ptr, so it is marked only nounwind; a readnone
    ; attribute would contradict the load.
    425 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind {
    426 ; CHECK-LABEL: vpaddd_mask_fold_test:
    427 ; CHECK:       # %bb.0:
    428 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
    429 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
    430 ; CHECK-NEXT:    retq
    431   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    432   %j = load <16 x i32>, <16 x i32>* %j.ptr
    433   %x = add <16 x i32> %i, %j
    434   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
    435   ret <16 x i32> %r
    436 }
    437 
    ; Merge-masked add with a splat constant (every element is i32 4): lanes where
    ; %mask1 is non-zero receive %i + 4, the other lanes keep %i. Expected to use
    ; vpaddd with a RIP-relative {1to16} broadcast operand under {%k1} masking.
    438 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
    439 ; CHECK-LABEL: vpaddd_mask_broadcast_test:
    440 ; CHECK:       # %bb.0:
    441 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
    442 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
    443 ; CHECK-NEXT:    retq
    444   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    445   %x = add <16 x i32> %i, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
    446   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
    447   ret <16 x i32> %r
    448 }
    449 
    ; Zero-masked add whose second operand is loaded from %j.ptr: lanes where
    ; %mask1 is non-zero receive %i + load(%j.ptr), the other lanes are zero.
    ; Expected to fold the load into vpaddd as (%rdi) under {%k1} {z} masking.
    ; The function dereferences %j.ptr, so it is marked only nounwind; a readnone
    ; attribute would contradict the load.
    450 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind {
    451 ; CHECK-LABEL: vpaddd_maskz_fold_test:
    452 ; CHECK:       # %bb.0:
    453 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
    454 ; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
    455 ; CHECK-NEXT:    retq
    456   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    457   %j = load <16 x i32>, <16 x i32>* %j.ptr
    458   %x = add <16 x i32> %i, %j
    459   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
    460   ret <16 x i32> %r
    461 }
    462 
    ; Zero-masked add with a splat constant (every element is i32 5): lanes where
    ; %mask1 is non-zero receive %i + 5, the other lanes are zero. Expected to use
    ; vpaddd with a RIP-relative {1to16} broadcast operand under {%k1} {z}.
    463 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
    464 ; CHECK-LABEL: vpaddd_maskz_broadcast_test:
    465 ; CHECK:       # %bb.0:
    466 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
    467 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
    468 ; CHECK-NEXT:    retq
    469   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    470   %x = add <16 x i32> %i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
    471   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
    472   ret <16 x i32> %r
    473 }
    474 
    ; Register-register integer subtract (%i - %j) of two <8 x i64> values;
    ; expected to become a single 512-bit vpsubq in every RUN configuration.
    475 define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
    476 ; CHECK-LABEL: vpsubq_test:
    477 ; CHECK:       # %bb.0:
    478 ; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
    479 ; CHECK-NEXT:    retq
    480   %x = sub <8 x i64> %i, %j
    481   ret <8 x i64> %x
    482 }
    483 
    ; Register-register integer subtract (%i - %j) of two <16 x i32> values;
    ; expected to become a single 512-bit vpsubd in every RUN configuration.
    484 define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
    485 ; CHECK-LABEL: vpsubd_test:
    486 ; CHECK:       # %bb.0:
    487 ; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
    488 ; CHECK-NEXT:    retq
    489   %x = sub <16 x i32> %i, %j
    490   ret <16 x i32> %x
    491 }
    492 
    ; Register-register integer multiply of two <16 x i32> values; expected to
    ; become a single 512-bit vpmulld in every RUN configuration.
    493 define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
    494 ; CHECK-LABEL: vpmulld_test:
    495 ; CHECK:       # %bb.0:
    496 ; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
    497 ; CHECK-NEXT:    retq
    498   %x = mul <16 x i32> %i, %j
    499   ret <16 x i32> %x
    500 }
    501 
    ; Scalar float square root through a tail call to the external function
    ; @sqrtf, which is declared readnone. The call is expected to be lowered to an
    ; inline vsqrtss rather than an actual call.
    502 declare float @sqrtf(float) readnone
    503 define float @sqrtA(float %a) nounwind uwtable readnone ssp {
    504 ; CHECK-LABEL: sqrtA:
    505 ; CHECK:       # %bb.0: # %entry
    506 ; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
    507 ; CHECK-NEXT:    retq
    508 entry:
    509   %conv1 = tail call float @sqrtf(float %a) nounwind readnone
    510   ret float %conv1
    511 }
    512 
    ; Scalar double square root through a tail call to the external function
    ; @sqrt, which is declared readnone. The call is expected to be lowered to an
    ; inline vsqrtsd rather than an actual call.
    513 declare double @sqrt(double) readnone
    514 define double @sqrtB(double %a) nounwind uwtable readnone ssp {
    515 ; CHECK-LABEL: sqrtB:
    516 ; CHECK:       # %bb.0: # %entry
    517 ; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
    518 ; CHECK-NEXT:    retq
    519 entry:
    520   %call = tail call double @sqrt(double %a) nounwind readnone
    521   ret double %call
    522 }
    523 
    ; Scalar float square root through the llvm.sqrt.f32 intrinsic; expected to
    ; become a single vsqrtss.
    524 declare float @llvm.sqrt.f32(float)
    525 define float @sqrtC(float %a) nounwind {
    526 ; CHECK-LABEL: sqrtC:
    527 ; CHECK:       # %bb.0:
    528 ; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
    529 ; CHECK-NEXT:    retq
    530   %b = call float @llvm.sqrt.f32(float %a)
    531   ret float %b
    532 }
    533 
    ; <16 x float> square root through the llvm.sqrt.v16f32 intrinsic; expected to
    ; become a single 512-bit vsqrtps.
    534 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
    535 define <16 x float> @sqrtD(<16 x float> %a) nounwind {
    536 ; CHECK-LABEL: sqrtD:
    537 ; CHECK:       # %bb.0:
    538 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
    539 ; CHECK-NEXT:    retq
    540   %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
    541   ret <16 x float> %b
    542 }
    543 
    ; <8 x double> square root through the llvm.sqrt.v8f64 intrinsic; expected to
    ; become a single 512-bit vsqrtpd.
    544 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
    545 define <8 x double> @sqrtE(<8 x double> %a) nounwind {
    546 ; CHECK-LABEL: sqrtE:
    547 ; CHECK:       # %bb.0:
    548 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
    549 ; CHECK-NEXT:    retq
    550   %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
    551   ret <8 x double> %b
    552 }
    553 
    ; fadd of a <16 x float> argument with a splat constant (all sixteen elements
    ; are the same value). Expected to use vaddps with a RIP-relative {1to16}
    ; embedded-broadcast operand.
    554 define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
    555 ; CHECK-LABEL: fadd_broadcast:
    556 ; CHECK:       # %bb.0:
    557 ; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
    558 ; CHECK-NEXT:    retq
    559   %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
    560   ret <16 x float> %b
    561 }
    562 
    ; Integer add of an <8 x i64> argument with a splat constant (every element is
    ; i64 2). Expected to use vpaddq with a RIP-relative {1to8} embedded-broadcast
    ; operand.
    563 define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
    564 ; CHECK-LABEL: addq_broadcast:
    565 ; CHECK:       # %bb.0:
    566 ; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    567 ; CHECK-NEXT:    retq
    568   %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
    569   ret <8 x i64> %b
    570 }
    571 
    ; Bitwise or of an <8 x i64> argument with a splat constant (every element is
    ; i64 2), using a RIP-relative {1to8} embedded-broadcast operand.
    ; Runs without the DQ feature (AVX512F, AVX512VL, AVX512BW) expect the integer
    ; form vporq; runs with DQ (AVX512DQ, SKX) expect the floating-point-domain
    ; form vorpd.
    572 define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
    573 ; AVX512F-LABEL: orq_broadcast:
    574 ; AVX512F:       # %bb.0:
    575 ; AVX512F-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    576 ; AVX512F-NEXT:    retq
    577 ;
    578 ; AVX512VL-LABEL: orq_broadcast:
    579 ; AVX512VL:       # %bb.0:
    580 ; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    581 ; AVX512VL-NEXT:    retq
    582 ;
    583 ; AVX512BW-LABEL: orq_broadcast:
    584 ; AVX512BW:       # %bb.0:
    585 ; AVX512BW-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    586 ; AVX512BW-NEXT:    retq
    587 ;
    588 ; AVX512DQ-LABEL: orq_broadcast:
    589 ; AVX512DQ:       # %bb.0:
    590 ; AVX512DQ-NEXT:    vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
    591 ; AVX512DQ-NEXT:    retq
    592 ;
    593 ; SKX-LABEL: orq_broadcast:
    594 ; SKX:       # %bb.0:
    595 ; SKX-NEXT:    vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
    596 ; SKX-NEXT:    retq
    597   %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
    598   ret <8 x i64> %b
    599 }
    600 
    ; Bitwise and of a <16 x i32> argument with a vector loaded from %x (align 4),
    ; with the load folded as the (%rdi) memory operand.
    ; Runs without the DQ feature (AVX512F, AVX512VL, AVX512BW) expect vpandq;
    ; runs with DQ (AVX512DQ, SKX) expect vandps.
    601 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
    602 ; AVX512F-LABEL: andd512fold:
    603 ; AVX512F:       # %bb.0: # %entry
    604 ; AVX512F-NEXT:    vpandq (%rdi), %zmm0, %zmm0
    605 ; AVX512F-NEXT:    retq
    606 ;
    607 ; AVX512VL-LABEL: andd512fold:
    608 ; AVX512VL:       # %bb.0: # %entry
    609 ; AVX512VL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
    610 ; AVX512VL-NEXT:    retq
    611 ;
    612 ; AVX512BW-LABEL: andd512fold:
    613 ; AVX512BW:       # %bb.0: # %entry
    614 ; AVX512BW-NEXT:    vpandq (%rdi), %zmm0, %zmm0
    615 ; AVX512BW-NEXT:    retq
    616 ;
    617 ; AVX512DQ-LABEL: andd512fold:
    618 ; AVX512DQ:       # %bb.0: # %entry
    619 ; AVX512DQ-NEXT:    vandps (%rdi), %zmm0, %zmm0
    620 ; AVX512DQ-NEXT:    retq
    621 ;
    622 ; SKX-LABEL: andd512fold:
    623 ; SKX:       # %bb.0: # %entry
    624 ; SKX-NEXT:    vandps (%rdi), %zmm0, %zmm0
    625 ; SKX-NEXT:    retq
    626 entry:
    627   %a = load <16 x i32>, <16 x i32>* %x, align 4
    628   %b = and <16 x i32> %y, %a
    629   ret <16 x i32> %b
    630 }
    631 
    ; Bitwise and of an <8 x i64> argument with a splat of an i64 loaded from %ap.
    ; The splat is built with insertelement into lane 0 followed by a
    ; shufflevector using an all-zero mask. The load plus splat is expected to be
    ; folded as a (%rdi){1to8} broadcast operand.
    ; Runs without the DQ feature (AVX512F, AVX512VL, AVX512BW) expect vpandq;
    ; runs with DQ (AVX512DQ, SKX) expect vandpd.
    632 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
    633 ; AVX512F-LABEL: andqbrst:
    634 ; AVX512F:       # %bb.0: # %entry
    635 ; AVX512F-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
    636 ; AVX512F-NEXT:    retq
    637 ;
    638 ; AVX512VL-LABEL: andqbrst:
    639 ; AVX512VL:       # %bb.0: # %entry
    640 ; AVX512VL-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
    641 ; AVX512VL-NEXT:    retq
    642 ;
    643 ; AVX512BW-LABEL: andqbrst:
    644 ; AVX512BW:       # %bb.0: # %entry
    645 ; AVX512BW-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
    646 ; AVX512BW-NEXT:    retq
    647 ;
    648 ; AVX512DQ-LABEL: andqbrst:
    649 ; AVX512DQ:       # %bb.0: # %entry
    650 ; AVX512DQ-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
    651 ; AVX512DQ-NEXT:    retq
    652 ;
    653 ; SKX-LABEL: andqbrst:
    654 ; SKX:       # %bb.0: # %entry
    655 ; SKX-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
    656 ; SKX-NEXT:    retq
    657 entry:
    658   %a = load i64, i64* %ap, align 8
    659   %b = insertelement <8 x i64> undef, i64 %a, i32 0
    660   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
    661   %d = and <8 x i64> %p1, %c
    662   ret <8 x i64>%d
    663 }
    664 
    ; Merge-masked fadd: lanes where %mask1 is non-zero receive %i + %j, the other
    ; lanes keep %dst. Expected to materialize the mask in k1 with vptestmd and
    ; emit vaddps into zmm0 under {%k1}.
    ; The signature spans several lines; the autogenerated check lines sit
    ; between the first and the remaining parameter lines.
    665 define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
    666 ; CHECK-LABEL: test_mask_vaddps:
    667 ; CHECK:       # %bb.0:
    668 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    669 ; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1}
    670 ; CHECK-NEXT:    retq
    671                                      <16 x float> %j, <16 x i32> %mask1)
    672                                      nounwind readnone {
    673   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    674   %x = fadd <16 x float> %i, %j
    675   %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
    676   ret <16 x float> %r
    677 }
    678 
    ; Merge-masked fmul: lanes where %mask1 is non-zero receive %i * %j, the other
    ; lanes keep %dst. Expected to materialize the mask in k1 with vptestmd and
    ; emit vmulps into zmm0 under {%k1}.
    ; The signature spans several lines; the autogenerated check lines sit
    ; between the first and the remaining parameter lines.
    679 define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
    680 ; CHECK-LABEL: test_mask_vmulps:
    681 ; CHECK:       # %bb.0:
    682 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    683 ; CHECK-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1}
    684 ; CHECK-NEXT:    retq
    685                                      <16 x float> %j, <16 x i32> %mask1)
    686                                      nounwind readnone {
    687   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    688   %x = fmul <16 x float> %i, %j
    689   %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
    690   ret <16 x float> %r
    691 }
    692 
    ; Merge-masked minimum: the minimum is written as fcmp olt followed by a
    ; select of %i or %j; lanes where %mask1 is non-zero receive that result, the
    ; other lanes keep %dst. Expected to materialize the mask in k1 with vptestmd
    ; and emit vminps into zmm0 under {%k1}.
    ; The signature spans several lines; the autogenerated check lines sit
    ; between the first and the remaining parameter lines.
    693 define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
    694 ; CHECK-LABEL: test_mask_vminps:
    695 ; CHECK:       # %bb.0:
    696 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    697 ; CHECK-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
    698 ; CHECK-NEXT:    retq
    699                                      <16 x float> %j, <16 x i32> %mask1)
    700                                      nounwind readnone {
    701   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    702   %cmp_res = fcmp olt <16 x float> %i, %j
    703   %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
    704   %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst
    705   ret <16 x float> %r
    706 }
    707 
    ; Merge-masked minimum on <8 x double>, with the mask derived from an
    ; <8 x i32> value. The minimum is written as fcmp olt followed by a select of
    ; %i or %j; lanes where %mask1 is non-zero receive that result, the other
    ; lanes keep %dst.
    ; Runs without the VL feature (AVX512F, AVX512BW, AVX512DQ) expect ymm3 to be
    ; treated as zmm3 (see the "kill" annotation) and tested with a 512-bit
    ; vptestmd. Runs with VL (AVX512VL, SKX) expect vptestmd directly on ymm3.
    ; All runs then expect vminpd into zmm0 under {%k1}.
    708 define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
    709 ; AVX512F-LABEL: test_mask_vminpd:
    710 ; AVX512F:       # %bb.0:
    711 ; AVX512F-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    712 ; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
    713 ; AVX512F-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
    714 ; AVX512F-NEXT:    retq
    715 ;
    716 ; AVX512VL-LABEL: test_mask_vminpd:
    717 ; AVX512VL:       # %bb.0:
    718 ; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
    719 ; AVX512VL-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
    720 ; AVX512VL-NEXT:    retq
    721 ;
    722 ; AVX512BW-LABEL: test_mask_vminpd:
    723 ; AVX512BW:       # %bb.0:
    724 ; AVX512BW-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    725 ; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
    726 ; AVX512BW-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
    727 ; AVX512BW-NEXT:    retq
    728 ;
    729 ; AVX512DQ-LABEL: test_mask_vminpd:
    730 ; AVX512DQ:       # %bb.0:
    731 ; AVX512DQ-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    732 ; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
    733 ; AVX512DQ-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
    734 ; AVX512DQ-NEXT:    retq
    735 ;
    736 ; SKX-LABEL: test_mask_vminpd:
    737 ; SKX:       # %bb.0:
    738 ; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
    739 ; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
    740 ; SKX-NEXT:    retq
    741                                      <8 x double> %j, <8 x i32> %mask1)
    742                                      nounwind readnone {
    743   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
    744   %cmp_res = fcmp olt <8 x double> %i, %j
    745   %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
    746   %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst
    747   ret <8 x double> %r
    748 }
    749 
    750 define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
        ; Merge-masked float maximum. The fcmp ogt + select pair in the body is
        ; expected to be selected as one vmaxps executed under write-mask k1, with
        ; k1 produced by a vptestmd of %mask1 against itself.
    751 ; CHECK-LABEL: test_mask_vmaxps:
    752 ; CHECK:       # %bb.0:
    753 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    754 ; CHECK-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
    755 ; CHECK-NEXT:    retq
    756                                      <16 x float> %j, <16 x i32> %mask1)
    757                                      nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    758   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
          ; Per-lane maximum spelled as compare + select.
    759   %cmp_res = fcmp ogt <16 x float> %i, %j
    760   %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
          ; Inactive lanes take their value from %dst.
    761   %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst
    762   ret <16 x float> %r
    763 }
    764 
    765 define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
        ; Merge-masked double maximum with an <8 x i32> mask source. The expected
        ; asm differs per run only in how the mask is tested: the runs without
        ; +avx512vl first widen ymm3 to zmm3 (the "kill" line) and test the zmm
        ; register, while the AVX512VL and SKX runs test ymm3 directly. The vmaxpd
        ; under k1 is the same in every run.
    766 ; AVX512F-LABEL: test_mask_vmaxpd:
    767 ; AVX512F:       # %bb.0:
    768 ; AVX512F-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    769 ; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
    770 ; AVX512F-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
    771 ; AVX512F-NEXT:    retq
    772 ;
    773 ; AVX512VL-LABEL: test_mask_vmaxpd:
    774 ; AVX512VL:       # %bb.0:
    775 ; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
    776 ; AVX512VL-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
    777 ; AVX512VL-NEXT:    retq
    778 ;
    779 ; AVX512BW-LABEL: test_mask_vmaxpd:
    780 ; AVX512BW:       # %bb.0:
    781 ; AVX512BW-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    782 ; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
    783 ; AVX512BW-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
    784 ; AVX512BW-NEXT:    retq
    785 ;
    786 ; AVX512DQ-LABEL: test_mask_vmaxpd:
    787 ; AVX512DQ:       # %bb.0:
    788 ; AVX512DQ-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
    789 ; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
    790 ; AVX512DQ-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
    791 ; AVX512DQ-NEXT:    retq
    792 ;
    793 ; SKX-LABEL: test_mask_vmaxpd:
    794 ; SKX:       # %bb.0:
    795 ; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
    796 ; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
    797 ; SKX-NEXT:    retq
    798                                      <8 x double> %j, <8 x i32> %mask1)
    799                                      nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    800   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
          ; Per-lane maximum spelled as compare + select.
    801   %cmp_res = fcmp ogt <8 x double> %i, %j
    802   %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
          ; Inactive lanes take their value from %dst.
    803   %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst
    804   ret <8 x double> %r
    805 }
    806 
    807 define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
        ; Merge-masked float subtraction: fsub followed by a select on the mask is
        ; expected to become a single vsubps executed under write-mask k1.
    808 ; CHECK-LABEL: test_mask_vsubps:
    809 ; CHECK:       # %bb.0:
    810 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    811 ; CHECK-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
    812 ; CHECK-NEXT:    retq
    813                                      <16 x float> %j, <16 x i32> %mask1)
    814                                      nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    815   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    816   %x = fsub <16 x float> %i, %j
          ; Inactive lanes take their value from %dst.
    817   %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
    818   ret <16 x float> %r
    819 }
    820 
    821 define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
        ; Merge-masked float division: fdiv followed by a select on the mask is
        ; expected to become a single vdivps executed under write-mask k1.
    822 ; CHECK-LABEL: test_mask_vdivps:
    823 ; CHECK:       # %bb.0:
    824 ; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
    825 ; CHECK-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1}
    826 ; CHECK-NEXT:    retq
    827                                      <16 x float> %j, <16 x i32> %mask1)
    828                                      nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    829   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
    830   %x = fdiv <16 x float> %i, %j
          ; Inactive lanes take their value from %dst.
    831   %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
    832   ret <16 x float> %r
    833 }
    834 
    835 define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
        ; Merge-masked double addition with an <8 x i64> mask source: the mask is
        ; expected to come from vptestmq and the fadd + select to become a single
        ; vaddpd executed under write-mask k1.
    836 ; CHECK-LABEL: test_mask_vaddpd:
    837 ; CHECK:       # %bb.0:
    838 ; CHECK-NEXT:    vptestmq %zmm3, %zmm3, %k1
    839 ; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1}
    840 ; CHECK-NEXT:    retq
    841                                      <8 x double> %j, <8 x i64> %mask1)
    842                                      nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    843   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
    844   %x = fadd <8 x double> %i, %j
          ; Inactive lanes take their value from %dst.
    845   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
    846   ret <8 x double> %r
    847 }
    848 
    849 define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
        ; Zero-masked double addition: selecting between the fadd result and
        ; zeroinitializer is expected to become a single vaddpd carrying the
        ; {%k1} {z} zeroing-mask annotation.
    850 ; CHECK-LABEL: test_maskz_vaddpd:
    851 ; CHECK:       # %bb.0:
    852 ; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
    853 ; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
    854 ; CHECK-NEXT:    retq
    855                                       <8 x i64> %mask1) nounwind readnone {
          ; A lane is active when its %mask1 element is non-zero.
    856   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
    857   %x = fadd <8 x double> %i, %j
          ; Inactive lanes are zero.
    858   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
    859   ret <8 x double> %r
    860 }
    861 
    862 define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
        ; Merge-masked double addition whose second operand is loaded from %j. The
        ; expected asm has no separate load: the memory operand (%rdi) appears
        ; directly in the masked vaddpd.
    863 ; CHECK-LABEL: test_mask_fold_vaddpd:
    864 ; CHECK:       # %bb.0:
    865 ; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
    866 ; CHECK-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1}
    867 ; CHECK-NEXT:    retq
    868                                      <8 x double>* %j,  <8 x i64> %mask1)
    869                                      nounwind {
          ; A lane is active when its %mask1 element is non-zero.
    870   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
          ; Full-vector load that only declares 8-byte alignment.
    871   %tmp = load <8 x double>, <8 x double>* %j, align 8
    872   %x = fadd <8 x double> %i, %tmp
          ; Inactive lanes take their value from %dst.
    873   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
    874   ret <8 x double> %r
    875 }
    876 
    877 define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
        ; Zero-masked double addition whose second operand is loaded from %j. The
        ; expected asm has no separate load: the memory operand (%rdi) appears
        ; directly in the vaddpd, which carries the {%k1} {z} annotation.
    878 ; CHECK-LABEL: test_maskz_fold_vaddpd:
    879 ; CHECK:       # %bb.0:
    880 ; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
    881 ; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
    882 ; CHECK-NEXT:    retq
    883                                       <8 x i64> %mask1) nounwind {
          ; A lane is active when its %mask1 element is non-zero.
    884   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
          ; Full-vector load that only declares 8-byte alignment.
    885   %tmp = load <8 x double>, <8 x double>* %j, align 8
    886   %x = fadd <8 x double> %i, %tmp
          ; Inactive lanes are zero.
    887   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
    888   ret <8 x double> %r
    889 }
    890 
    891 define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
        ; Unmasked addition of %i and a splat of the scalar loaded from %j. The
        ; expected asm is a single vaddpd whose memory operand uses the {1to8}
        ; embedded-broadcast form, with no separate load or broadcast instruction.
    892 ; CHECK-LABEL: test_broadcast_vaddpd:
    893 ; CHECK:       # %bb.0:
    894 ; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0
    895 ; CHECK-NEXT:    retq
    896   %tmp = load double, double* %j
          ; Splat: insert the scalar into lane 0, then shuffle with an all-zero index
          ; vector so every result lane reads lane 0.
    897   %b = insertelement <8 x double> undef, double %tmp, i32 0
    898   %c = shufflevector <8 x double> %b, <8 x double> undef,
    899                      <8 x i32> zeroinitializer
    900   %x = fadd <8 x double> %c, %i
    901   ret <8 x double> %x
    902 }
    903 
    904 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
        ; Merge-masked addition of %i and a splat of the scalar loaded from %j. The
        ; expected asm uses the {1to8} embedded-broadcast memory operand under
        ; write-mask k1, accumulates into zmm1, and then copies zmm1 to zmm0.
        ; NOTE(review): the final select passes through %i, not %dst, so %dst is
        ; never read in this function. The expected asm matches the IR as written;
        ; whether the %i passthrough is intentional is not visible from this file.
        ; Confirm before changing it, and regenerate the assertions if it changes.
    905 ; CHECK-LABEL: test_mask_broadcast_vaddpd:
    906 ; CHECK:       # %bb.0:
    907 ; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
    908 ; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
    909 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
    910 ; CHECK-NEXT:    retq
    911                                       double* %j, <8 x i64> %mask1) nounwind {
          ; A lane is active when its %mask1 element is non-zero.
    912   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
    913   %tmp = load double, double* %j
          ; Splat: insert the scalar into lane 0, then shuffle with an all-zero index
          ; vector so every result lane reads lane 0.
    914   %b = insertelement <8 x double> undef, double %tmp, i32 0
    915   %c = shufflevector <8 x double> %b, <8 x double> undef,
    916                      <8 x i32> zeroinitializer
    917   %x = fadd <8 x double> %c, %i
          ; Inactive lanes take their value from %i (see the review note above).
    918   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i
    919   ret <8 x double> %r
    920 }
    921 
    922 define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
        ; Zero-masked addition of %i and a splat of the scalar loaded from %j. The
        ; expected asm is a single vaddpd that combines the {1to8}
        ; embedded-broadcast memory operand with the {%k1} {z} annotation.
    923 ; CHECK-LABEL: test_maskz_broadcast_vaddpd:
    924 ; CHECK:       # %bb.0:
    925 ; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
    926 ; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
    927 ; CHECK-NEXT:    retq
    928                                        <8 x i64> %mask1) nounwind {
          ; A lane is active when its %mask1 element is non-zero.
    929   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
    930   %tmp = load double, double* %j
          ; Splat: insert the scalar into lane 0, then shuffle with an all-zero index
          ; vector so every result lane reads lane 0.
    931   %b = insertelement <8 x double> undef, double %tmp, i32 0
    932   %c = shufflevector <8 x double> %b, <8 x double> undef,
    933                      <8 x i32> zeroinitializer
    934   %x = fadd <8 x double> %c, %i
          ; Inactive lanes are zero.
    935   %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
    936   ret <8 x double> %r
    937 }
    938 
    939 define <16 x float>  @test_fxor(<16 x float> %a) {
        ; 512-bit float negation written as an fsub from a -0.0 splat. The expected
        ; asm is one XOR of %a with a {1to16} broadcast constant read from a
        ; rip-relative address. The runs without +avx512dq expect the integer form
        ; vpxord; the AVX512DQ and SKX runs expect vxorps.
    940 ; AVX512F-LABEL: test_fxor:
    941 ; AVX512F:       # %bb.0:
    942 ; AVX512F-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
    943 ; AVX512F-NEXT:    retq
    944 ;
    945 ; AVX512VL-LABEL: test_fxor:
    946 ; AVX512VL:       # %bb.0:
    947 ; AVX512VL-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
    948 ; AVX512VL-NEXT:    retq
    949 ;
    950 ; AVX512BW-LABEL: test_fxor:
    951 ; AVX512BW:       # %bb.0:
    952 ; AVX512BW-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
    953 ; AVX512BW-NEXT:    retq
    954 ;
    955 ; AVX512DQ-LABEL: test_fxor:
    956 ; AVX512DQ:       # %bb.0:
    957 ; AVX512DQ-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
    958 ; AVX512DQ-NEXT:    retq
    959 ;
    960 ; SKX-LABEL: test_fxor:
    961 ; SKX:       # %bb.0:
    962 ; SKX-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
    963 ; SKX-NEXT:    retq
    964 
          ; (-0.0 splat) - %a
    965   %res = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    966   ret <16 x float>%res
    967 }
    968 
    969 define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
        ; 256-bit version of the fsub-from-(-0.0) negation. The expected asm depends
        ; on +avx512vl: the AVX512VL run expects vpxord and the SKX run expects
        ; vxorps, both with a {1to8} broadcast memory operand on ymm registers. The
        ; remaining runs expect the -0 splat to be materialized into ymm1 with
        ; vbroadcastss and then combined with a register-register vxorps.
    970 ; AVX512F-LABEL: test_fxor_8f32:
    971 ; AVX512F:       # %bb.0:
    972 ; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
    973 ; AVX512F-NEXT:    vxorps %ymm1, %ymm0, %ymm0
    974 ; AVX512F-NEXT:    retq
    975 ;
    976 ; AVX512VL-LABEL: test_fxor_8f32:
    977 ; AVX512VL:       # %bb.0:
    978 ; AVX512VL-NEXT:    vpxord {{.*}}(%rip){1to8}, %ymm0, %ymm0
    979 ; AVX512VL-NEXT:    retq
    980 ;
    981 ; AVX512BW-LABEL: test_fxor_8f32:
    982 ; AVX512BW:       # %bb.0:
    983 ; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
    984 ; AVX512BW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
    985 ; AVX512BW-NEXT:    retq
    986 ;
    987 ; AVX512DQ-LABEL: test_fxor_8f32:
    988 ; AVX512DQ:       # %bb.0:
    989 ; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
    990 ; AVX512DQ-NEXT:    vxorps %ymm1, %ymm0, %ymm0
    991 ; AVX512DQ-NEXT:    retq
    992 ;
    993 ; SKX-LABEL: test_fxor_8f32:
    994 ; SKX:       # %bb.0:
    995 ; SKX-NEXT:    vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0
    996 ; SKX-NEXT:    retq
          ; (-0.0 splat) - %a
    997   %res = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    998   ret <8 x float>%res
    999 }
   1000 
   1001 define <8 x double> @fabs_v8f64(<8 x double> %p)
        ; 512-bit double absolute value through the llvm.fabs.v8f64 intrinsic. The
        ; expected asm is one AND of %p with a {1to8} broadcast constant read from a
        ; rip-relative address. The runs without +avx512dq expect the integer form
        ; vpandq; the AVX512DQ and SKX runs expect vandpd.
   1002 ; AVX512F-LABEL: fabs_v8f64:
   1003 ; AVX512F:       # %bb.0:
   1004 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   1005 ; AVX512F-NEXT:    retq
   1006 ;
   1007 ; AVX512VL-LABEL: fabs_v8f64:
   1008 ; AVX512VL:       # %bb.0:
   1009 ; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   1010 ; AVX512VL-NEXT:    retq
   1011 ;
   1012 ; AVX512BW-LABEL: fabs_v8f64:
   1013 ; AVX512BW:       # %bb.0:
   1014 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   1015 ; AVX512BW-NEXT:    retq
   1016 ;
   1017 ; AVX512DQ-LABEL: fabs_v8f64:
   1018 ; AVX512DQ:       # %bb.0:
   1019 ; AVX512DQ-NEXT:    vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
   1020 ; AVX512DQ-NEXT:    retq
   1021 ;
   1022 ; SKX-LABEL: fabs_v8f64:
   1023 ; SKX:       # %bb.0:
   1024 ; SKX-NEXT:    vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
   1025 ; SKX-NEXT:    retq
   1026 {
   1027   %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
   1028   ret <8 x double> %t
   1029 }
        ; Declaration of the intrinsic called by fabs_v8f64.
   1030 declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
   1031 
   1032 define <16 x float> @fabs_v16f32(<16 x float> %p)
        ; 512-bit float absolute value through the llvm.fabs.v16f32 intrinsic. The
        ; expected asm is one AND of %p with a {1to16} broadcast constant read from
        ; a rip-relative address. The runs without +avx512dq expect the integer
        ; form vpandd; the AVX512DQ and SKX runs expect vandps.
   1033 ; AVX512F-LABEL: fabs_v16f32:
   1034 ; AVX512F:       # %bb.0:
   1035 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1036 ; AVX512F-NEXT:    retq
   1037 ;
   1038 ; AVX512VL-LABEL: fabs_v16f32:
   1039 ; AVX512VL:       # %bb.0:
   1040 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1041 ; AVX512VL-NEXT:    retq
   1042 ;
   1043 ; AVX512BW-LABEL: fabs_v16f32:
   1044 ; AVX512BW:       # %bb.0:
   1045 ; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1046 ; AVX512BW-NEXT:    retq
   1047 ;
   1048 ; AVX512DQ-LABEL: fabs_v16f32:
   1049 ; AVX512DQ:       # %bb.0:
   1050 ; AVX512DQ-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1051 ; AVX512DQ-NEXT:    retq
   1052 ;
   1053 ; SKX-LABEL: fabs_v16f32:
   1054 ; SKX:       # %bb.0:
   1055 ; SKX-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1056 ; SKX-NEXT:    retq
   1057 {
   1058   %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
   1059   ret <16 x float> %t
   1060 }
        ; Declaration of the intrinsic called by fabs_v16f32.
   1061 declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
   1062