; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW

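; The X86 PAVGB/PAVGW instructions compute the rounding unsigned average
; (a + b + 1) >> 1 of their byte/word elements. Each test below spells that
; average out in a widened i32 domain (zext, add, add 1, lshr 1, trunc) and
; checks that the backend collapses the whole sequence into a single pavg,
; with sub-128-bit vectors loaded and stored via movd/movq.
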
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgb (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

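; 256-bit cases: with AVX2 (and AVX-512BW) the average is performed on a
; full ymm register; the AVX2 sequences end in vzeroupper to avoid
; AVX-to-SSE transition penalties.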
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

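; 512-bit case: AVX-512BW extends the byte/word averages to zmm registers
; (vmovdqu8 + vpavgb).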
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

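; The i16 element variants of the same pattern should select pavgw instead
; of pavgb.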
define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgw (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

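; The "_2" variants associate the adds the other way, (a + b) + 1 rather
; than (a + 1) + b; the rounding average is the same and should reach the
; same pavgb/pavgw lowering.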
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

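; Note that this function averages %4 with itself (the load of %a is dead),
; so only (%rsi) is read and the result is a self-average,
; "vpavgb %zmm0, %zmm0, %zmm0", as the checks below verify.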
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
; AVX512BW-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw (%rsi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

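; The "_const" variants average against a constant vector: (x + C) >> 1 is
; the same as (x + (C-1) + 1) >> 1, so it can be matched as a pavg against
; the adjusted constant, loaded RIP-relative from the constant pool.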
define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v8i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v8i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX2-LABEL: avg_v4i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v4i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, <32 x i16>* undef, align 4
  ret void
}
    725