; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64

;===-----------------------------------------------------------------------------===
;    This test checks the ability to recognize a cross element pattern of
;    constants and perform the load via broadcasting a smaller constant
;    vector.
;    For example:
;    <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===

     19 define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
     20 ; AVX-LABEL: f16xi8_i16:
     21 ; AVX:       # %bb.0:
     22 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
     23 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     24 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
     25 ; AVX-NEXT:    retl
     26 ;
     27 ; ALL32-LABEL: f16xi8_i16:
     28 ; ALL32:       # %bb.0:
     29 ; ALL32-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
     30 ; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     31 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
     32 ; ALL32-NEXT:    retl
     33 ;
     34 ; AVX-64-LABEL: f16xi8_i16:
     35 ; AVX-64:       # %bb.0:
     36 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
     37 ; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     38 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
     39 ; AVX-64-NEXT:    retq
     40 ;
     41 ; ALL64-LABEL: f16xi8_i16:
     42 ; ALL64:       # %bb.0:
     43 ; ALL64-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
     44 ; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     45 ; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
     46 ; ALL64-NEXT:    retq
     47   %res1 = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
     48   %res2 = and <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
     49   ret <16 x i8> %res2
     50 }
     51 
     52 
     53 define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
     54 ; AVX-LABEL: f16xi8_i32:
     55 ; AVX:       # %bb.0:
     56 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
     57 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     58 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
     59 ; AVX-NEXT:    retl
     60 ;
     61 ; ALL32-LABEL: f16xi8_i32:
     62 ; ALL32:       # %bb.0:
     63 ; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
     64 ; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     65 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
     66 ; ALL32-NEXT:    retl
     67 ;
     68 ; AVX-64-LABEL: f16xi8_i32:
     69 ; AVX-64:       # %bb.0:
     70 ; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
     71 ; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     72 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
     73 ; AVX-64-NEXT:    retq
     74 ;
     75 ; ALL64-LABEL: f16xi8_i32:
     76 ; ALL64:       # %bb.0:
     77 ; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
     78 ; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     79 ; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
     80 ; ALL64-NEXT:    retq
     81   %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
     82   %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
     83   ret <16 x i8> %res2
     84 }
     85 
     86 
     87 define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
     88 ; AVX-LABEL: f16xi8_i64:
     89 ; AVX:       # %bb.0:
     90 ; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
     91 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     92 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
     93 ; AVX-NEXT:    retl
     94 ;
     95 ; ALL32-LABEL: f16xi8_i64:
     96 ; ALL32:       # %bb.0:
     97 ; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
     98 ; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
     99 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
    100 ; ALL32-NEXT:    retl
    101 ;
    102 ; AVX-64-LABEL: f16xi8_i64:
    103 ; AVX-64:       # %bb.0:
    104 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    105 ; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    106 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
    107 ; AVX-64-NEXT:    retq
    108 ;
    109 ; ALL64-LABEL: f16xi8_i64:
    110 ; ALL64:       # %bb.0:
    111 ; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
    112 ; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    113 ; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
    114 ; ALL64-NEXT:    retq
    115   %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
    116   %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
    117   ret <16 x i8> %res2
    118 }
    119 
    120 
    121 define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
    122 ; AVX-LABEL: f32xi8_i16:
    123 ; AVX:       # %bb.0:
    124 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    125 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    126 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    127 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    128 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    129 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    130 ; AVX-NEXT:    retl
    131 ;
    132 ; ALL32-LABEL: f32xi8_i16:
    133 ; ALL32:       # %bb.0:
    134 ; ALL32-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    135 ; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    136 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    137 ; ALL32-NEXT:    retl
    138 ;
    139 ; AVX-64-LABEL: f32xi8_i16:
    140 ; AVX-64:       # %bb.0:
    141 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    142 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    143 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    144 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    145 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    146 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    147 ; AVX-64-NEXT:    retq
    148 ;
    149 ; ALL64-LABEL: f32xi8_i16:
    150 ; ALL64:       # %bb.0:
    151 ; ALL64-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    152 ; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    153 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    154 ; ALL64-NEXT:    retq
    155   %res1 = add <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
    156   %res2 = and <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
    157   ret <32 x i8> %res2
    158 }
    159 
    160 
    161 define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
    162 ; AVX-LABEL: f32xi8_i32:
    163 ; AVX:       # %bb.0:
    164 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    165 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
    166 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    167 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    168 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    169 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    170 ; AVX-NEXT:    retl
    171 ;
    172 ; ALL32-LABEL: f32xi8_i32:
    173 ; ALL32:       # %bb.0:
    174 ; ALL32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    175 ; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    176 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    177 ; ALL32-NEXT:    retl
    178 ;
    179 ; AVX-64-LABEL: f32xi8_i32:
    180 ; AVX-64:       # %bb.0:
    181 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    182 ; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
    183 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    184 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    185 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    186 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    187 ; AVX-64-NEXT:    retq
    188 ;
    189 ; ALL64-LABEL: f32xi8_i32:
    190 ; ALL64:       # %bb.0:
    191 ; ALL64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    192 ; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    193 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    194 ; ALL64-NEXT:    retq
    195   %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
    196   %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
    197   ret <32 x i8> %res2
    198 }
    199 
    200 
    201 define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
    202 ; AVX-LABEL: f32xi8_i64:
    203 ; AVX:       # %bb.0:
    204 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    205 ; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
    206 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    207 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    208 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    209 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    210 ; AVX-NEXT:    retl
    211 ;
    212 ; ALL32-LABEL: f32xi8_i64:
    213 ; ALL32:       # %bb.0:
    214 ; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
    215 ; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    216 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    217 ; ALL32-NEXT:    retl
    218 ;
    219 ; AVX-64-LABEL: f32xi8_i64:
    220 ; AVX-64:       # %bb.0:
    221 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    222 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
    223 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    224 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    225 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    226 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    227 ; AVX-64-NEXT:    retq
    228 ;
    229 ; ALL64-LABEL: f32xi8_i64:
    230 ; ALL64:       # %bb.0:
    231 ; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
    232 ; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    233 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    234 ; ALL64-NEXT:    retq
    235   %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
    236   %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
    237   ret <32 x i8> %res2
    238 }
    239 
    240 
    241 define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
    242 ; AVX-LABEL: f32xi8_i128:
    243 ; AVX:       # %bb.0:
    244 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    245 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    246 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    247 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    248 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    249 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    250 ; AVX-NEXT:    retl
    251 ;
    252 ; ALL32-LABEL: f32xi8_i128:
    253 ; ALL32:       # %bb.0:
    254 ; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    255 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
    256 ; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    257 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    258 ; ALL32-NEXT:    retl
    259 ;
    260 ; AVX-64-LABEL: f32xi8_i128:
    261 ; AVX-64:       # %bb.0:
    262 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    263 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    264 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
    265 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
    266 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    267 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    268 ; AVX-64-NEXT:    retq
    269 ;
    270 ; ALL64-LABEL: f32xi8_i128:
    271 ; ALL64:       # %bb.0:
    272 ; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    273 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
    274 ; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
    275 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    276 ; ALL64-NEXT:    retq
    277   %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
    278   %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
    279   ret <32 x i8> %res2
    280 }
    281 
    282 
    283 define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
    284 ; AVX-LABEL: f64xi8_i16:
    285 ; AVX:       # %bb.0:
    286 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    287 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    288 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    289 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    290 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    291 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    292 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    293 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    294 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    295 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    296 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    297 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    298 ; AVX-NEXT:    retl
    299 ;
    300 ; NO-AVX512BW-LABEL: f64xi8_i16:
    301 ; NO-AVX512BW:       # %bb.0:
    302 ; NO-AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    303 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    304 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    305 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
    306 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
    307 ; NO-AVX512BW-NEXT:    retl
    308 ;
    309 ; AVX512BW-LABEL: f64xi8_i16:
    310 ; AVX512BW:       # %bb.0:
    311 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    312 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    313 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    314 ; AVX512BW-NEXT:    retl
    315 ;
    316 ; AVX-64-LABEL: f64xi8_i16:
    317 ; AVX-64:       # %bb.0:
    318 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    319 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    320 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    321 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    322 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    323 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    324 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    325 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    326 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    327 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
    328 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    329 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    330 ; AVX-64-NEXT:    retq
    331 ;
    332 ; NO-AVX512BW-64-LABEL: f64xi8_i16:
    333 ; NO-AVX512BW-64:       # %bb.0:
    334 ; NO-AVX512BW-64-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    335 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    336 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    337 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    338 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    339 ; NO-AVX512BW-64-NEXT:    retq
    340 ;
    341 ; AVX512BW-64-LABEL: f64xi8_i16:
    342 ; AVX512BW-64:       # %bb.0:
    343 ; AVX512BW-64-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
    344 ; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    345 ; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    346 ; AVX512BW-64-NEXT:    retq
    347   %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
    348   %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
    349   ret <64 x i8> %res2
    350 }
    351 
    352 
    353 define <64 x i8> @f64i8_i32(<64 x i8> %a) {
    354 ; AVX-LABEL: f64i8_i32:
    355 ; AVX:       # %bb.0:
    356 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    357 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
    358 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    359 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    360 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    361 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    362 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    363 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    364 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    365 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    366 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    367 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    368 ; AVX-NEXT:    retl
    369 ;
    370 ; NO-AVX512BW-LABEL: f64i8_i32:
    371 ; NO-AVX512BW:       # %bb.0:
    372 ; NO-AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    373 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    374 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    375 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
    376 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
    377 ; NO-AVX512BW-NEXT:    retl
    378 ;
    379 ; AVX512BW-LABEL: f64i8_i32:
    380 ; AVX512BW:       # %bb.0:
    381 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    382 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    383 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    384 ; AVX512BW-NEXT:    retl
    385 ;
    386 ; AVX-64-LABEL: f64i8_i32:
    387 ; AVX-64:       # %bb.0:
    388 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    389 ; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
    390 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    391 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    392 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    393 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    394 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    395 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    396 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    397 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    398 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    399 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    400 ; AVX-64-NEXT:    retq
    401 ;
    402 ; NO-AVX512BW-64-LABEL: f64i8_i32:
    403 ; NO-AVX512BW-64:       # %bb.0:
    404 ; NO-AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    405 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    406 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    407 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    408 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    409 ; NO-AVX512BW-64-NEXT:    retq
    410 ;
    411 ; AVX512BW-64-LABEL: f64i8_i32:
    412 ; AVX512BW-64:       # %bb.0:
    413 ; AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
    414 ; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    415 ; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    416 ; AVX512BW-64-NEXT:    retq
    417   %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
    418   %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
    419   ret <64 x i8> %res2
    420 }
    421 
    422 
    423 define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
    424 ; AVX-LABEL: f64xi8_i64:
    425 ; AVX:       # %bb.0:
    426 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    427 ; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
    428 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    429 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    430 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    431 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    432 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    433 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    434 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    435 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
    436 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    437 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    438 ; AVX-NEXT:    retl
    439 ;
    440 ; NO-AVX512BW-LABEL: f64xi8_i64:
    441 ; NO-AVX512BW:       # %bb.0:
    442 ; NO-AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
    443 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    444 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    445 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
    446 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
    447 ; NO-AVX512BW-NEXT:    retl
    448 ;
    449 ; AVX512BW-LABEL: f64xi8_i64:
    450 ; AVX512BW:       # %bb.0:
    451 ; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
    452 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    453 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    454 ; AVX512BW-NEXT:    retl
    455 ;
    456 ; AVX-64-LABEL: f64xi8_i64:
    457 ; AVX-64:       # %bb.0:
    458 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    459 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
    460 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    461 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    462 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    463 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    464 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    465 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    466 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    467 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
    468 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    469 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    470 ; AVX-64-NEXT:    retq
    471 ;
    472 ; NO-AVX512BW-64-LABEL: f64xi8_i64:
    473 ; NO-AVX512BW-64:       # %bb.0:
    474 ; NO-AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
    475 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    476 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    477 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    478 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    479 ; NO-AVX512BW-64-NEXT:    retq
    480 ;
    481 ; AVX512BW-64-LABEL: f64xi8_i64:
    482 ; AVX512BW-64:       # %bb.0:
    483 ; AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
    484 ; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    485 ; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    486 ; AVX512BW-64-NEXT:    retq
    487   %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
    488   %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
    489   ret <64 x i8> %res2
    490 }
    491 
    492 
    493 define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
    494 ; AVX-LABEL: f64xi8_i128:
    495 ; AVX:       # %bb.0:
    496 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    497 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    498 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    499 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    500 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    501 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    502 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    503 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    504 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    505 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    506 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    507 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    508 ; AVX-NEXT:    retl
    509 ;
    510 ; NO-AVX512BW-LABEL: f64xi8_i128:
    511 ; NO-AVX512BW:       # %bb.0:
    512 ; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    513 ; NO-AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
    514 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    515 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    516 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
    517 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
    518 ; NO-AVX512BW-NEXT:    retl
    519 ;
    520 ; AVX512BW-LABEL: f64xi8_i128:
    521 ; AVX512BW:       # %bb.0:
    522 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    523 ; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    524 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    525 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    526 ; AVX512BW-NEXT:    retl
    527 ;
    528 ; AVX-64-LABEL: f64xi8_i128:
    529 ; AVX-64:       # %bb.0:
    530 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    531 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    532 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    533 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
    534 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    535 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    536 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    537 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    538 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    539 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    540 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    541 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    542 ; AVX-64-NEXT:    retq
    543 ;
    544 ; NO-AVX512BW-64-LABEL: f64xi8_i128:
    545 ; NO-AVX512BW-64:       # %bb.0:
    546 ; NO-AVX512BW-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    547 ; NO-AVX512BW-64-NEXT:    # ymm2 = mem[0,1,0,1]
    548 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    549 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    550 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    551 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    552 ; NO-AVX512BW-64-NEXT:    retq
    553 ;
    554 ; AVX512BW-64-LABEL: f64xi8_i128:
    555 ; AVX512BW-64:       # %bb.0:
    556 ; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    557 ; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    558 ; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    559 ; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    560 ; AVX512BW-64-NEXT:    retq
    561   %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
    562   %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
    563   ret <64 x i8> %res2
    564 }
    565 
    566 
    567 define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
    568 ; AVX-LABEL: f64xi8_i256:
    569 ; AVX:       # %bb.0:
    570 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    571 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    572 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    573 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    574 ; AVX-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
    575 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    576 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    577 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    578 ; AVX-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
    579 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    580 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    581 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    582 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    583 ; AVX-NEXT:    retl
    584 ;
    585 ; NO-AVX512BW-LABEL: f64xi8_i256:
    586 ; NO-AVX512BW:       # %bb.0:
    587 ; NO-AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    588 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    589 ; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    590 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
    591 ; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
    592 ; NO-AVX512BW-NEXT:    retl
    593 ;
    594 ; AVX512BW-LABEL: f64xi8_i256:
    595 ; AVX512BW:       # %bb.0:
    596 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    597 ; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
    598 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    599 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    600 ; AVX512BW-NEXT:    retl
    601 ;
    602 ; AVX-64-LABEL: f64xi8_i256:
    603 ; AVX-64:       # %bb.0:
    604 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    605 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    606 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    607 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    608 ; AVX-64-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
    609 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    610 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    611 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
    612 ; AVX-64-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
    613 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    614 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    615 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    616 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    617 ; AVX-64-NEXT:    retq
    618 ;
    619 ; NO-AVX512BW-64-LABEL: f64xi8_i256:
    620 ; NO-AVX512BW-64:       # %bb.0:
    621 ; NO-AVX512BW-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    622 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    623 ; NO-AVX512BW-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    624 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    625 ; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    626 ; NO-AVX512BW-64-NEXT:    retq
    627 ;
    628 ; AVX512BW-64-LABEL: f64xi8_i256:
    629 ; AVX512BW-64:       # %bb.0:
    630 ; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
    631 ; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
    632 ; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    633 ; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    634 ; AVX512BW-64-NEXT:    retq
    635   %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
    636   %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
    637   ret <64 x i8> %res2
    638 }
    639 
    640 
; 8 x i16 repeating every 32 bits: the <i16 0, i16 1> pair is broadcast as a
; single dword — vbroadcastss on AVX1 (printed as a float splat) and
; vpbroadcastd on AVX2/AVX512 (printed as the integer splat 65536).
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <8 x i16> %res2
}
    673 
    674 
; 8 x i16 repeating every 64 bits: the <i16 0..3> quad is broadcast as one
; qword — vmovddup from memory on 32-bit targets and vpbroadcastq with an
; immediate splat constant (844433520132096) on 64-bit AVX2/AVX512 targets.
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <8 x i16> %res2
}
    707 
    708 
; 16 x i16 repeating every 32 bits in a ymm-sized value: AVX2/AVX512 use a
; single dword broadcast into ymm (vpbroadcastd); AVX1 has no integer ymm
; ops, so it broadcasts into xmm, adds per 128-bit half, and folds the
; final mask from a constant-pool load into vandps.
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <16 x i16> %res2
}
    747 
    748 
; 16 x i16 repeating every 64 bits in a ymm-sized value: AVX2/AVX512 use a
; single qword broadcast into ymm (vpbroadcastq); AVX1 splits the value into
; xmm halves, reuses a vmovddup'd xmm constant for the adds, and folds the
; final mask from the constant pool into vandps.
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <16 x i16> %res2
}
    787 
    788 
; 16 x i16 repeating every 128 bits: AVX2/AVX512 broadcast the full xmm
; pattern into ymm with vbroadcasti128; AVX1 loads the xmm constant once
; (vmovdqa), adds each 128-bit half, and folds the mask from the constant
; pool into vandps.
define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
  ret <16 x i16> %res2
}
    829 
    830 
; 32 x i16 (512 bits) repeating every 32 bits: AVX512BW uses one dword
; broadcast into zmm (vpbroadcastd); targets without AVX512BW handle the
; value as two ymm halves sharing a single ymm dword broadcast; AVX1 drops
; to four xmm pieces around an xmm vbroadcastss.
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; NO-AVX512BW-LABEL: f32xi16_i32:
; NO-AVX512BW:       # %bb.0:
; NO-AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i32:
; NO-AVX512BW-64:       # %bb.0:
; NO-AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i32:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <32 x i16> %res2
}
    899 
    900 
; 32 x i16 (512 bits) repeating every 64 bits: AVX512BW uses one qword
; broadcast into zmm (vpbroadcastq); without AVX512BW the two ymm halves
; share a ymm qword broadcast; AVX1 reuses a vmovddup'd xmm constant across
; the four 128-bit pieces.
define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; NO-AVX512BW-LABEL: f32xi16_i64:
; NO-AVX512BW:       # %bb.0:
; NO-AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i64:
; NO-AVX512BW-64:       # %bb.0:
; NO-AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i64:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <32 x i16> %res2
}
    969 
    970 
; 32 x i16 (512 bits) repeating every 128 bits: AVX512BW broadcasts the
; 16-byte pattern into zmm with vbroadcasti32x4; without AVX512BW each ymm
; half uses a vbroadcasti128; AVX1 loads the xmm constant once and reuses it
; across the four 128-bit pieces.
define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; NO-AVX512BW-LABEL: f32xi16_i128:
; NO-AVX512BW:       # %bb.0:
; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; NO-AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i128:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i128:
; NO-AVX512BW-64:       # %bb.0:
; NO-AVX512BW-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; NO-AVX512BW-64-NEXT:    # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i128:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
  ret <32 x i16> %res2
}
   1043 
   1044 
; 32 x i16 (512 bits) repeating every 256 bits: AVX512BW broadcasts the
; 32-byte pattern into zmm with vbroadcasti64x4; without AVX512BW each ymm
; half just loads the full 32-byte constant with vmovdqa (no broadcast is
; possible at ymm width for a 256-bit pattern).
define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i256:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; NO-AVX512BW-LABEL: f32xi16_i256:
; NO-AVX512BW:       # %bb.0:
; NO-AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i256:
; NO-AVX512BW-64:       # %bb.0:
; NO-AVX512BW-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i256:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
  ret <32 x i16> %res2
}
   1117 
   1118 
   1119 
   ; <0,1,0,1> repeats one 64-bit element, so the constant is loaded as a 64-bit
   ; broadcast (vmovddup, or vpbroadcastq on 64-bit targets) rather than a full
   ; 128-bit constant-pool load.
    1120 define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
    1121 ; AVX-LABEL: f4xi32_i64:
    1122 ; AVX:       # %bb.0:
    1123 ; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1124 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    1125 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    1126 ; AVX-NEXT:    retl
    1127 ;
    1128 ; ALL32-LABEL: f4xi32_i64:
    1129 ; ALL32:       # %bb.0:
    1130 ; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1131 ; ALL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    1132 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
    1133 ; ALL32-NEXT:    retl
    1134 ;
    1135 ; AVX-64-LABEL: f4xi32_i64:
    1136 ; AVX-64:       # %bb.0:
    1137 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1138 ; AVX-64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    1139 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
    1140 ; AVX-64-NEXT:    retq
    1141 ;
    1142 ; ALL64-LABEL: f4xi32_i64:
    1143 ; ALL64:       # %bb.0:
    1144 ; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
    1145 ; ALL64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
    1146 ; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
    1147 ; ALL64-NEXT:    retq
    1148   %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
    1149   %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
    1150   ret <4 x i32> %res2
    1151 }
   1152 
   1153 
   ; Same 64-bit pattern at 256-bit width. The AVX1 path does the integer add in
   ; two 128-bit halves (vextractf128/vinsertf128); AVX2+ broadcasts the 64-bit
   ; constant to a ymm register. The 2.12...E-314 in the ALL32 check is just the
   ; autogenerated rendering of the bit pattern 0x0000000100000000 as a double.
    1154 define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
    1155 ; AVX-LABEL: f8xi32_i64:
    1156 ; AVX:       # %bb.0:
    1157 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1158 ; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
    1159 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    1160 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    1161 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1162 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    1163 ; AVX-NEXT:    retl
    1164 ;
    1165 ; ALL32-LABEL: f8xi32_i64:
    1166 ; ALL32:       # %bb.0:
    1167 ; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
    1168 ; ALL32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    1169 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1170 ; ALL32-NEXT:    retl
    1171 ;
    1172 ; AVX-64-LABEL: f8xi32_i64:
    1173 ; AVX-64:       # %bb.0:
    1174 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1175 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
    1176 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    1177 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    1178 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1179 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    1180 ; AVX-64-NEXT:    retq
    1181 ;
    1182 ; ALL64-LABEL: f8xi32_i64:
    1183 ; ALL64:       # %bb.0:
    1184 ; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
    1185 ; ALL64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    1186 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1187 ; ALL64-NEXT:    retq
    1188   %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
    1189   %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
    1190   ret <8 x i32> %res2
    1191 }
   1192 
   1193 
   ; <0,1,2,3> repeated twice: the 128-bit lane is broadcast with vbroadcasti128
   ; on AVX2+; the AVX1 path adds per 128-bit half and does the AND as a
   ; full-width vandps against a constant-pool operand.
    1194 define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
    1195 ; AVX-LABEL: f8xi32_i128:
    1196 ; AVX:       # %bb.0:
    1197 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1198 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
    1199 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    1200 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    1201 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1202 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    1203 ; AVX-NEXT:    retl
    1204 ;
    1205 ; ALL32-LABEL: f8xi32_i128:
    1206 ; ALL32:       # %bb.0:
    1207 ; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
    1208 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
    1209 ; ALL32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    1210 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1211 ; ALL32-NEXT:    retl
    1212 ;
    1213 ; AVX-64-LABEL: f8xi32_i128:
    1214 ; AVX-64:       # %bb.0:
    1215 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1216 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
    1217 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
    1218 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
    1219 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1220 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    1221 ; AVX-64-NEXT:    retq
    1222 ;
    1223 ; ALL64-LABEL: f8xi32_i128:
    1224 ; ALL64:       # %bb.0:
    1225 ; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
    1226 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
    1227 ; ALL64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    1228 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1229 ; ALL64-NEXT:    retq
    1230   %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
    1231   %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
    1232   ret <8 x i32> %res2
    1233 }
   1234 
   1235 
   ; 64-bit pattern at 512-bit width: AVX512 broadcasts to a zmm register, AVX2
   ; works on two ymm halves, and AVX1 on four xmm quarters joined with
   ; vinsertf128. The E-314 doubles in the 32-bit checks are the autogenerated
   ; rendering of the 0x0000000100000000 bit pattern.
    1236 define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
    1237 ; AVX-LABEL: f16xi32_i64:
    1238 ; AVX:       # %bb.0:
    1239 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1240 ; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
    1241 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1242 ; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    1243 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1244 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1245 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1246 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    1247 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1248 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
    1249 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1250 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1251 ; AVX-NEXT:    retl
    1252 ;
    1253 ; AVX2-LABEL: f16xi32_i64:
    1254 ; AVX2:       # %bb.0:
    1255 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
    1256 ; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    1257 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    1258 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1259 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1260 ; AVX2-NEXT:    retl
    1261 ;
    1262 ; AVX512-LABEL: f16xi32_i64:
    1263 ; AVX512:       # %bb.0:
    1264 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
    1265 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    1266 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1267 ; AVX512-NEXT:    retl
    1268 ;
    1269 ; AVX-64-LABEL: f16xi32_i64:
    1270 ; AVX-64:       # %bb.0:
    1271 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1272 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
    1273 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1274 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    1275 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1276 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1277 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1278 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    1279 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1280 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
    1281 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1282 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1283 ; AVX-64-NEXT:    retq
    1284 ;
    1285 ; AVX2-64-LABEL: f16xi32_i64:
    1286 ; AVX2-64:       # %bb.0:
    1287 ; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
    1288 ; AVX2-64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    1289 ; AVX2-64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    1290 ; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1291 ; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1292 ; AVX2-64-NEXT:    retq
    1293 ;
    1294 ; AVX512F-64-LABEL: f16xi32_i64:
    1295 ; AVX512F-64:       # %bb.0:
    1296 ; AVX512F-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
    1297 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    1298 ; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1299 ; AVX512F-64-NEXT:    retq
    1300   %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
    1301   %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
    1302   ret <16 x i32> %res2
    1303 }
   1304 
   1305 
   ; 128-bit pattern <0,1,2,3> in a 512-bit vector: AVX512 uses vbroadcasti32x4
   ; to a zmm, AVX2 uses vbroadcasti128 per ymm half, and AVX1 falls back to
   ; 128-bit adds with a full-width ymm constant for the AND.
    1306 define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
    1307 ; AVX-LABEL: f16xi32_i128:
    1308 ; AVX:       # %bb.0:
    1309 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1310 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
    1311 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1312 ; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    1313 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1314 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1315 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1316 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    1317 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1318 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
    1319 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1320 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1321 ; AVX-NEXT:    retl
    1322 ;
    1323 ; AVX2-LABEL: f16xi32_i128:
    1324 ; AVX2:       # %bb.0:
    1325 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
    1326 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
    1327 ; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    1328 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    1329 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1330 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1331 ; AVX2-NEXT:    retl
    1332 ;
    1333 ; AVX512-LABEL: f16xi32_i128:
    1334 ; AVX512:       # %bb.0:
    1335 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1336 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1337 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    1338 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1339 ; AVX512-NEXT:    retl
    1340 ;
    1341 ; AVX-64-LABEL: f16xi32_i128:
    1342 ; AVX-64:       # %bb.0:
    1343 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1344 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
    1345 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1346 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    1347 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1348 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1349 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
    1350 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    1351 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1352 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
    1353 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1354 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1355 ; AVX-64-NEXT:    retq
    1356 ;
    1357 ; AVX2-64-LABEL: f16xi32_i128:
    1358 ; AVX2-64:       # %bb.0:
    1359 ; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
    1360 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
    1361 ; AVX2-64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
    1362 ; AVX2-64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    1363 ; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1364 ; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1365 ; AVX2-64-NEXT:    retq
    1366 ;
    1367 ; AVX512F-64-LABEL: f16xi32_i128:
    1368 ; AVX512F-64:       # %bb.0:
    1369 ; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1370 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1371 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    1372 ; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1373 ; AVX512F-64-NEXT:    retq
    1374   %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
    1375   %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
    1376   ret <16 x i32> %res2
    1377 }
   1378 
   1379 
   ; i64 pattern <0,1> repeated. The 32-bit checks print each i64 as a pair of
   ; 32-bit words ([0,0,1,0]). On x86-64 AVX1 the constant is built in registers
   ; (movl/vmovq/vpslldq); 64-bit AVX2+ uses a 128-bit vbroadcasti128.
    1380 define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
    1381 ; AVX-LABEL: f4xi64_i128:
    1382 ; AVX:       # %bb.0:
    1383 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1384 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,0]
    1385 ; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
    1386 ; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    1387 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1388 ; AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
    1389 ; AVX-NEXT:    retl
    1390 ;
    1391 ; ALL32-LABEL: f4xi64_i128:
    1392 ; ALL32:       # %bb.0:
    1393 ; ALL32-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
    1394 ; ALL32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    1395 ; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1396 ; ALL32-NEXT:    retl
    1397 ;
    1398 ; AVX-64-LABEL: f4xi64_i128:
    1399 ; AVX-64:       # %bb.0:
    1400 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
    1401 ; AVX-64-NEXT:    movl $1, %eax
    1402 ; AVX-64-NEXT:    vmovq %rax, %xmm2
    1403 ; AVX-64-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
    1404 ; AVX-64-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
    1405 ; AVX-64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    1406 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    1407 ; AVX-64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
    1408 ; AVX-64-NEXT:    retq
    1409 ;
    1410 ; ALL64-LABEL: f4xi64_i128:
    1411 ; ALL64:       # %bb.0:
    1412 ; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
    1413 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
    1414 ; ALL64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    1415 ; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
    1416 ; ALL64-NEXT:    retq
    1417   %res1 = add <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %a
    1418   %res2 = and <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %res1
    1419   ret <4 x i64> %res2
    1420 }
   1421 
   1422 
   ; Same <0,1> i64 lane pattern at 512-bit width; 64-bit AVX512 forms the
   ; constant with vbroadcasti32x4, AVX2-64 with vbroadcasti128 per ymm half, and
   ; AVX1-64 materializes <0,1> in registers before the per-half vpaddq.
    1423 define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
    1424 ; AVX-LABEL: f8xi64_i128:
    1425 ; AVX:       # %bb.0:
    1426 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1427 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,0]
    1428 ; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1429 ; AVX-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
    1430 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1431 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1432 ; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1433 ; AVX-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
    1434 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1435 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
    1436 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1437 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1438 ; AVX-NEXT:    retl
    1439 ;
    1440 ; AVX2-LABEL: f8xi64_i128:
    1441 ; AVX2:       # %bb.0:
    1442 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
    1443 ; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
    1444 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    1445 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1446 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1447 ; AVX2-NEXT:    retl
    1448 ;
    1449 ; AVX512-LABEL: f8xi64_i128:
    1450 ; AVX512:       # %bb.0:
    1451 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
    1452 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    1453 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1454 ; AVX512-NEXT:    retl
    1455 ;
    1456 ; AVX-64-LABEL: f8xi64_i128:
    1457 ; AVX-64:       # %bb.0:
    1458 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1459 ; AVX-64-NEXT:    movl $1, %eax
    1460 ; AVX-64-NEXT:    vmovq %rax, %xmm3
    1461 ; AVX-64-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
    1462 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1463 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
    1464 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1465 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1466 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1467 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
    1468 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1469 ; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1]
    1470 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
    1471 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1472 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1473 ; AVX-64-NEXT:    retq
    1474 ;
    1475 ; AVX2-64-LABEL: f8xi64_i128:
    1476 ; AVX2-64:       # %bb.0:
    1477 ; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
    1478 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
    1479 ; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
    1480 ; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    1481 ; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1482 ; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1483 ; AVX2-64-NEXT:    retq
    1484 ;
    1485 ; AVX512F-64-LABEL: f8xi64_i128:
    1486 ; AVX512F-64:       # %bb.0:
    1487 ; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
    1488 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1489 ; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    1490 ; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1491 ; AVX512F-64-NEXT:    retq
    1492   %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
    1493   %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
    1494   ret <8 x i64> %res2
    1495 }
   1496 
   1497 
   ; 256-bit i64 pattern <0,1,2,3> repeated twice in 512 bits. Only AVX512F-64
   ; broadcasts the 256-bit half (vbroadcasti64x4); the other configs use
   ; full-width or per-half constants. 32-bit checks again print i64 elements as
   ; 32-bit word pairs (e.g. [0,0,1,0,2,0,3,0]).
    1498 define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
    1499 ; AVX-LABEL: f8xi64_i256:
    1500 ; AVX:       # %bb.0:
    1501 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1502 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,0,3,0]
    1503 ; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1504 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,1,0]
    1505 ; AVX-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
    1506 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1507 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1508 ; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1509 ; AVX-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
    1510 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1511 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
    1512 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1513 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1514 ; AVX-NEXT:    retl
    1515 ;
    1516 ; AVX2-LABEL: f8xi64_i256:
    1517 ; AVX2:       # %bb.0:
    1518 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
    1519 ; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
    1520 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    1521 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1522 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1523 ; AVX2-NEXT:    retl
    1524 ;
    1525 ; AVX512-LABEL: f8xi64_i256:
    1526 ; AVX512:       # %bb.0:
    1527 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
    1528 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    1529 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1530 ; AVX512-NEXT:    retl
    1531 ;
    1532 ; AVX-64-LABEL: f8xi64_i256:
    1533 ; AVX-64:       # %bb.0:
    1534 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
    1535 ; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3]
    1536 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1537 ; AVX-64-NEXT:    movl $1, %eax
    1538 ; AVX-64-NEXT:    vmovq %rax, %xmm4
    1539 ; AVX-64-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
    1540 ; AVX-64-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
    1541 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    1542 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
    1543 ; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
    1544 ; AVX-64-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
    1545 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    1546 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3]
    1547 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
    1548 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
    1549 ; AVX-64-NEXT:    retq
    1550 ;
    1551 ; AVX2-64-LABEL: f8xi64_i256:
    1552 ; AVX2-64:       # %bb.0:
    1553 ; AVX2-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
    1554 ; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
    1555 ; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    1556 ; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
    1557 ; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
    1558 ; AVX2-64-NEXT:    retq
    1559 ;
    1560 ; AVX512F-64-LABEL: f8xi64_i256:
    1561 ; AVX512F-64:       # %bb.0:
    1562 ; AVX512F-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
    1563 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
    1564 ; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    1565 ; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    1566 ; AVX512F-64-NEXT:    retq
    1567   %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
    1568   %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
    1569   ret <8 x i64> %res2
    1570 }
   1571 
   1572 
   ; Float pair <2.0,1.0> repeated: loaded as a 64-bit broadcast (vmovddup /
   ; vpbroadcastq). The 64-bit check prints the two packed floats as the raw
   ; integer bit pattern 4575657222482165760.
    1573 define <4 x float> @f4xf32_f64(<4 x float> %a) {
    1574 ; AVX-LABEL: f4xf32_f64:
    1575 ; AVX:       # %bb.0:
    1576 ; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1577 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
    1578 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
    1579 ; AVX-NEXT:    retl
    1580 ;
    1581 ; ALL32-LABEL: f4xf32_f64:
    1582 ; ALL32:       # %bb.0:
    1583 ; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1584 ; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
    1585 ; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
    1586 ; ALL32-NEXT:    retl
    1587 ;
    1588 ; AVX-64-LABEL: f4xf32_f64:
    1589 ; AVX-64:       # %bb.0:
    1590 ; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
    1591 ; AVX-64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
    1592 ; AVX-64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
    1593 ; AVX-64-NEXT:    retq
    1594 ;
    1595 ; ALL64-LABEL: f4xf32_f64:
    1596 ; ALL64:       # %bb.0:
    1597 ; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
    1598 ; ALL64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
    1599 ; ALL64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
    1600 ; ALL64-NEXT:    retq
    1601   %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
    1602   %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
    1603   ret <4 x float> %res2
    1604 }
   1605 
   1606 
   ; 256-bit version: vbroadcastsd splats the 64-bit <2.0,1.0> float pair. The
   ; 32-bit checks render that bit pattern as the double 0.0078125018626451492;
   ; the 64-bit checks render it as the integer 4575657222482165760.
    1607 define <8 x float> @f8xf32_f64(<8 x float> %a) {
    1608 ; AVX-LABEL: f8xf32_f64:
    1609 ; AVX:       # %bb.0:
    1610 ; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1611 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1612 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1613 ; AVX-NEXT:    retl
    1614 ;
    1615 ; ALL32-LABEL: f8xf32_f64:
    1616 ; ALL32:       # %bb.0:
    1617 ; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1618 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1619 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1620 ; ALL32-NEXT:    retl
    1621 ;
    1622 ; AVX-64-LABEL: f8xf32_f64:
    1623 ; AVX-64:       # %bb.0:
    1624 ; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1625 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1626 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1627 ; AVX-64-NEXT:    retq
    1628 ;
    1629 ; ALL64-LABEL: f8xf32_f64:
    1630 ; ALL64:       # %bb.0:
    1631 ; ALL64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
    1632 ; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1633 ; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1634 ; ALL64-NEXT:    retq
    1635   %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
    1636   %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
    1637   ret <8 x float> %res2
    1638 }
   1639 
   1640 
   ; 128-bit float lane <4.0,1.0,2.0,3.0> repeated twice: every config loads it
   ; with a single vbroadcastf128, available since AVX1.
    1641 define <8 x float> @f8xf32_f128(<8 x float> %a) {
    1642 ; AVX-LABEL: f8xf32_f128:
    1643 ; AVX:       # %bb.0:
    1644 ; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1645 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
    1646 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1647 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1648 ; AVX-NEXT:    retl
    1649 ;
    1650 ; ALL32-LABEL: f8xf32_f128:
    1651 ; ALL32:       # %bb.0:
    1652 ; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1653 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
    1654 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1655 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1656 ; ALL32-NEXT:    retl
    1657 ;
    1658 ; AVX-64-LABEL: f8xf32_f128:
    1659 ; AVX-64:       # %bb.0:
    1660 ; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1661 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
    1662 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1663 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1664 ; AVX-64-NEXT:    retq
    1665 ;
    1666 ; ALL64-LABEL: f8xf32_f128:
    1667 ; ALL64:       # %bb.0:
    1668 ; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1669 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
    1670 ; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
    1671 ; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
    1672 ; ALL64-NEXT:    retq
    1673   %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
    1674   %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
    1675   ret <8 x float> %res2
    1676 }
   1677 
   1678 
   ; 64-bit float pattern at 512-bit width: AVX512 does a single zmm
   ; vbroadcastsd; AVX/AVX2 split the work across two ymm registers.
    1679 define <16 x float> @f16xf32_f64(<16 x float> %a) {
    1680 ; AVX-LABEL: f16xf32_f64:
    1681 ; AVX:       # %bb.0:
    1682 ; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1683 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1684 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1685 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1686 ; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1687 ; AVX-NEXT:    retl
    1688 ;
    1689 ; AVX2-LABEL: f16xf32_f64:
    1690 ; AVX2:       # %bb.0:
    1691 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1692 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1693 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1694 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1695 ; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1696 ; AVX2-NEXT:    retl
    1697 ;
    1698 ; AVX512-LABEL: f16xf32_f64:
    1699 ; AVX512:       # %bb.0:
    1700 ; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1701 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
    1702 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
    1703 ; AVX512-NEXT:    retl
    1704 ;
    1705 ; AVX-64-LABEL: f16xf32_f64:
    1706 ; AVX-64:       # %bb.0:
    1707 ; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
    1708 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1709 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1710 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1711 ; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1712 ; AVX-64-NEXT:    retq
    1713 ;
    1714 ; AVX2-64-LABEL: f16xf32_f64:
    1715 ; AVX2-64:       # %bb.0:
    1716 ; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
    1717 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1718 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1719 ; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1720 ; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1721 ; AVX2-64-NEXT:    retq
    1722 ;
    1723 ; AVX512F-64-LABEL: f16xf32_f64:
    1724 ; AVX512F-64:       # %bb.0:
    1725 ; AVX512F-64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
    1726 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
    1727 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
    1728 ; AVX512F-64-NEXT:    retq
    1729   %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
    1730   %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
    1731   ret <16 x float> %res2
    1732 }
   1733 
   1734 
   ; 128-bit float lane repeated four times in 512 bits: AVX512 broadcasts with
   ; vbroadcastf32x4 to a zmm; AVX/AVX2 use vbroadcastf128 per ymm half.
    1735 define <16 x float> @f16xf32_f128(<16 x float> %a) {
    1736 ; AVX-LABEL: f16xf32_f128:
    1737 ; AVX:       # %bb.0:
    1738 ; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1739 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
    1740 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1741 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1742 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1743 ; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1744 ; AVX-NEXT:    retl
    1745 ;
    1746 ; AVX2-LABEL: f16xf32_f128:
    1747 ; AVX2:       # %bb.0:
    1748 ; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1749 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
    1750 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1751 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1752 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1753 ; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1754 ; AVX2-NEXT:    retl
    1755 ;
    1756 ; AVX512-LABEL: f16xf32_f128:
    1757 ; AVX512:       # %bb.0:
    1758 ; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1759 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1760 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
    1761 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
    1762 ; AVX512-NEXT:    retl
    1763 ;
    1764 ; AVX-64-LABEL: f16xf32_f128:
    1765 ; AVX-64:       # %bb.0:
    1766 ; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1767 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
    1768 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1769 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1770 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1771 ; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1772 ; AVX-64-NEXT:    retq
    1773 ;
    1774 ; AVX2-64-LABEL: f16xf32_f128:
    1775 ; AVX2-64:       # %bb.0:
    1776 ; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1777 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
    1778 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    1779 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
    1780 ; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
    1781 ; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
    1782 ; AVX2-64-NEXT:    retq
    1783 ;
    1784 ; AVX512F-64-LABEL: f16xf32_f128:
    1785 ; AVX512F-64:       # %bb.0:
    1786 ; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
    1787 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
    1788 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
    1789 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
    1790 ; AVX512F-64-NEXT:    retq
    1791   %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
    1792   %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
    1793   ret <16 x float> %res2
    1794 }
   1795 
   1796 
; The <16 x float> constant repeats an 8-element (256-bit) group
; [8,1,2,3,4,5,6,7].  The 256-bit runs below materialize it once as a full
; ymm constant (vmovaps) and reuse it for both halves of the vector, while
; the 512-bit runs broadcast the 256-bit group into a zmm (vbroadcastf64x4).
   1797 define <16 x float> @f16xf32_f256(<16 x float> %a) {
   1798 ; AVX-LABEL: f16xf32_f256:
   1799 ; AVX:       # %bb.0:
   1800 ; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1801 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
   1802 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   1803 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
   1804 ; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
   1805 ; AVX-NEXT:    retl
   1806 ;
   1807 ; AVX2-LABEL: f16xf32_f256:
   1808 ; AVX2:       # %bb.0:
   1809 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1810 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
   1811 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   1812 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
   1813 ; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
   1814 ; AVX2-NEXT:    retl
   1815 ;
   1816 ; AVX512-LABEL: f16xf32_f256:
   1817 ; AVX512:       # %bb.0:
   1818 ; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1819 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
   1820 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
   1821 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
   1822 ; AVX512-NEXT:    retl
   1823 ;
   1824 ; AVX-64-LABEL: f16xf32_f256:
   1825 ; AVX-64:       # %bb.0:
   1826 ; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1827 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
   1828 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   1829 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
   1830 ; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
   1831 ; AVX-64-NEXT:    retq
   1832 ;
   1833 ; AVX2-64-LABEL: f16xf32_f256:
   1834 ; AVX2-64:       # %bb.0:
   1835 ; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1836 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
   1837 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
   1838 ; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
   1839 ; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
   1840 ; AVX2-64-NEXT:    retq
   1841 ;
   1842 ; AVX512F-64-LABEL: f16xf32_f256:
   1843 ; AVX512F-64:       # %bb.0:
   1844 ; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
   1845 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
   1846 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
   1847 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
   1848 ; AVX512F-64-NEXT:    retq
   1849   %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
   1850   %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
   1851   ret <16 x float> %res2
   1852 }
   1853 
   1854 
; The <4 x double> constant repeats the 128-bit pair [2,1].  Every run
; recognizes the cross-lane repetition and loads the constant with a single
; vbroadcastf128 of the two unique doubles instead of a full 256-bit load.
   1855 define <4 x double> @f4xf64_f128(<4 x double> %a) {
   1856 ; AVX-LABEL: f4xf64_f128:
   1857 ; AVX:       # %bb.0:
   1858 ; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1859 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
   1860 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   1861 ; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
   1862 ; AVX-NEXT:    retl
   1863 ;
   1864 ; ALL32-LABEL: f4xf64_f128:
   1865 ; ALL32:       # %bb.0:
   1866 ; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1867 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
   1868 ; ALL32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   1869 ; ALL32-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
   1870 ; ALL32-NEXT:    retl
   1871 ;
   1872 ; AVX-64-LABEL: f4xf64_f128:
   1873 ; AVX-64:       # %bb.0:
   1874 ; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1875 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
   1876 ; AVX-64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   1877 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
   1878 ; AVX-64-NEXT:    retq
   1879 ;
   1880 ; ALL64-LABEL: f4xf64_f128:
   1881 ; ALL64:       # %bb.0:
   1882 ; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1883 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
   1884 ; ALL64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
   1885 ; ALL64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
   1886 ; ALL64-NEXT:    retq
   1887   %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
   1888   %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
   1889   ret <4 x double> %res2
   1890 }
   1891 
   1892 
; The <8 x double> constant repeats the 128-bit pair [2,1].  The 256-bit
; runs broadcast it into one ymm (vbroadcastf128) shared by both halves of
; the split vector; the 512-bit runs broadcast the same 128-bit group into
; a zmm (vbroadcastf32x4).
   1893 define <8 x double> @f8xf64_f128(<8 x double> %a) {
   1894 ; AVX-LABEL: f8xf64_f128:
   1895 ; AVX:       # %bb.0:
   1896 ; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1897 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
   1898 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1899 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1900 ; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1901 ; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1902 ; AVX-NEXT:    retl
   1903 ;
   1904 ; AVX2-LABEL: f8xf64_f128:
   1905 ; AVX2:       # %bb.0:
   1906 ; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1907 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
   1908 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1909 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1910 ; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1911 ; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1912 ; AVX2-NEXT:    retl
   1913 ;
   1914 ; AVX512-LABEL: f8xf64_f128:
   1915 ; AVX512:       # %bb.0:
   1916 ; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1917 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
   1918 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
   1919 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
   1920 ; AVX512-NEXT:    retl
   1921 ;
   1922 ; AVX-64-LABEL: f8xf64_f128:
   1923 ; AVX-64:       # %bb.0:
   1924 ; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1925 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
   1926 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1927 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1928 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1929 ; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1930 ; AVX-64-NEXT:    retq
   1931 ;
   1932 ; AVX2-64-LABEL: f8xf64_f128:
   1933 ; AVX2-64:       # %bb.0:
   1934 ; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1935 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
   1936 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1937 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1938 ; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1939 ; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1940 ; AVX2-64-NEXT:    retq
   1941 ;
   1942 ; AVX512F-64-LABEL: f8xf64_f128:
   1943 ; AVX512F-64:       # %bb.0:
   1944 ; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
   1945 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
   1946 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
   1947 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
   1948 ; AVX512F-64-NEXT:    retq
   1949   %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
   1950   %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
   1951   ret <8 x double> %res2
   1952 }
   1953 
   1954 
; Hand-written constant-pool checks for f8xf64_f256 below (these are kept
; across autogenerated check updates).  They verify that the AVX-512 run
; emits only the four unique doubles [4,1,2,3] for the repeating 256-bit
; group; the trailing NOT directive rejects any further .quad entries,
; i.e. no full 512-bit constant is materialized.
   1955 ; AVX512:       .LCPI37
   1956 ; AVX512-NEXT:  .quad	4616189618054758400     # double 4
   1957 ; AVX512-NEXT:  .quad	4607182418800017408     # double 1
   1958 ; AVX512-NEXT:  .quad	4611686018427387904     # double 2
   1959 ; AVX512-NEXT:  .quad	4613937818241073152     # double 3
   1960 ; AVX512-NOT:   .quad
   1961 
; The <8 x double> constant repeats a 4-element (256-bit) group [4,1,2,3].
; The 256-bit runs materialize it once as a full ymm constant (vmovapd) and
; reuse it for both halves; the 512-bit runs broadcast the 256-bit group
; into a zmm (vbroadcastf64x4).
   1962 define <8 x double> @f8xf64_f256(<8 x double> %a) {
   1963 ; AVX-LABEL: f8xf64_f256:
   1964 ; AVX:       # %bb.0:
   1965 ; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   1966 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1967 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1968 ; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1969 ; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1970 ; AVX-NEXT:    retl
   1971 ;
   1972 ; AVX2-LABEL: f8xf64_f256:
   1973 ; AVX2:       # %bb.0:
   1974 ; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   1975 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1976 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1977 ; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1978 ; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1979 ; AVX2-NEXT:    retl
   1980 ;
   1981 ; AVX512-LABEL: f8xf64_f256:
   1982 ; AVX512:       # %bb.0:
   1983 ; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   1984 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
   1985 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
   1986 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
   1987 ; AVX512-NEXT:    retl
   1988 ;
   1989 ; AVX-64-LABEL: f8xf64_f256:
   1990 ; AVX-64:       # %bb.0:
   1991 ; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   1992 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   1993 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   1994 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   1995 ; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   1996 ; AVX-64-NEXT:    retq
   1997 ;
   1998 ; AVX2-64-LABEL: f8xf64_f256:
   1999 ; AVX2-64:       # %bb.0:
   2000 ; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   2001 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
   2002 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
   2003 ; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
   2004 ; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
   2005 ; AVX2-64-NEXT:    retq
   2006 ;
   2007 ; AVX512F-64-LABEL: f8xf64_f256:
   2008 ; AVX512F-64:       # %bb.0:
   2009 ; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
   2010 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
   2011 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
   2012 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
   2013 ; AVX512F-64-NEXT:    retq
   2014   %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
   2015   %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
   2016   ret <8 x double> %res2
   2017 }
   2018 
   2019 
   2020 
; The <8 x i16> constant repeats the pair <0, -70>, whose little-endian
; 32-bit image is 0xFFBA0000 = 4290379776 — a float NaN bit pattern, hence
; the test name.  The broadcast must still be selected even though the
; 32-bit element prints as NaN; the plain-AVX runs use the FP broadcast
; (vbroadcastss), the AVX2/AVX-512 runs use the integer one (vpbroadcastd).
   2021 define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
   2022 ; AVX-LABEL: f8xi16_i32_NaN:
   2023 ; AVX:       # %bb.0:
   2024 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
   2025 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   2026 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
   2027 ; AVX-NEXT:    retl
   2028 ;
   2029 ; ALL32-LABEL: f8xi16_i32_NaN:
   2030 ; ALL32:       # %bb.0:
   2031 ; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
   2032 ; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   2033 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
   2034 ; ALL32-NEXT:    retl
   2035 ;
   2036 ; AVX-64-LABEL: f8xi16_i32_NaN:
   2037 ; AVX-64:       # %bb.0:
   2038 ; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
   2039 ; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   2040 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
   2041 ; AVX-64-NEXT:    retq
   2042 ;
   2043 ; ALL64-LABEL: f8xi16_i32_NaN:
   2044 ; ALL64:       # %bb.0:
   2045 ; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
   2046 ; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
   2047 ; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
   2048 ; ALL64-NEXT:    retq
   2049   %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
   2050   %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
   2051   ret <8 x i16> %res2
   2052 }
   2053