; (code-viewer header, not part of the original test file: "Home | History | Annotate | Download | only in X86")
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL

      5 define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
      6 ; AVX512F-LABEL: avg_v16i8_mask:
      7 ; AVX512F:       # %bb.0:
      8 ; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
      9 ; AVX512F-NEXT:    kmovw %edi, %k1
     10 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
     11 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
     12 ; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
     13 ; AVX512F-NEXT:    vzeroupper
     14 ; AVX512F-NEXT:    retq
     15 ;
     16 ; AVX512BWVL-LABEL: avg_v16i8_mask:
     17 ; AVX512BWVL:       # %bb.0:
     18 ; AVX512BWVL-NEXT:    kmovd %edi, %k1
     19 ; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
     20 ; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
     21 ; AVX512BWVL-NEXT:    retq
     22   %za = zext <16 x i8> %a to <16 x i16>
     23   %zb = zext <16 x i8> %b to <16 x i16>
     24   %add = add nuw nsw <16 x i16> %za, %zb
     25   %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
     26   %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
     27   %trunc = trunc <16 x i16> %lshr to <16 x i8>
     28   %mask1 = bitcast i16 %mask to <16 x i1>
     29   %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
     30   ret <16 x i8> %res
     31 }
     32 
; Same rounded-average pattern as avg_v16i8_mask, but zero-masked
; (select against zeroinitializer). AVX512BW+VL uses vpavgb {%k1} {z};
; plain AVX-512 materializes the lane mask and zeroes with vpand instead
; of a blend.
define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
     59 
; 256-bit merge-masked rounded average. The i32 mask exceeds the 16-bit
; k-registers available without AVX512BW, so the AVX512F path splits it
; (shrl $16) into two kmovw halves, expands each half to an xmm byte mask,
; rejoins them with vinserti128, and blends.
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}
     91 
; Zero-masked variant of avg_v32i8_mask: identical mask-splitting dance on
; the AVX512F path, but the final select-against-zero lowers to vpand
; rather than vpblendvb; AVX512BW+VL keeps it a single vpavgb {%k1} {z}.
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
    122 
; 512-bit merge-masked rounded average. Without AVX512BW the i64 mask is
; carved into four 16-bit pieces (shrl $16 / shrq $32 / shrq $48 -> k1..k4)
; and the op is done as two ymm vpavgb halves, each blended through a
; reconstructed ymm byte mask. AVX512BW+VL needs just kmovq + masked
; zmm vpavgb.
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movq %rdi, %rcx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    movl %edi, %edx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edx, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}
    168 
; Zero-masked variant of avg_v64i8_mask: same four-way 16-bit mask split
; on the AVX512F path, with the selects against zero lowering to vpand per
; ymm half; AVX512BW+VL is a single vpavgb {%k1} {z} on zmm.
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movq %rdi, %rcx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    movl %edi, %edx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edx, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}
    213 
; Word-element version: rounded unsigned average of two <8 x i16>,
; merge-masked by an i8 bitmask. Should lower to vpavgw; AVX512F expands
; the k-mask to word lanes via vpternlogd/vpmovdw and blends, AVX512BW+VL
; uses the masked vpavgw form directly.
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}
    241 
; Zero-masked variant of avg_v8i16_mask: select-against-zero lowers to
; vpand on the AVX512F path and to vpavgw {%k1} {z} with AVX512BW+VL.
define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
    268 
; 256-bit merge-masked vpavgw test. The i16 mask fits one k-register on
; both paths; AVX512F expands it to word lanes (vpternlogd/vpmovdw) and
; blends with vpblendvb, AVX512BW+VL folds the select into vpavgw {%k1}.
define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}
    295 
; Zero-masked variant of avg_v16i16_mask: vpand replaces the blend on the
; AVX512F path; AVX512BW+VL emits a single vpavgw {%k1} {z}.
define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
    321 
; 512-bit merge-masked vpavgw test. Without AVX512BW the i32 mask is split
; into two 16-bit halves (shrl $16), the op runs as two ymm vpavgw halves,
; and each half is blended through a vpternlogd/vpmovdw-expanded mask;
; AVX512BW+VL is kmovd + one masked zmm vpavgw.
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}
    354 
; Zero-masked variant of avg_v32i16_mask: same two-way mask split on the
; AVX512F path with vpand per ymm half instead of a blend; AVX512BW+VL is
; a single vpavgw {%k1} {z} on zmm.
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
    386