      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
      7 
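; Verify that ((zext(a) + 1) + zext(b)) >> 1, truncated back to i8, is matched to pavgb.
; The <4 x i8> case is loaded and stored through 32-bit movd.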
      8 define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
      9 ; SSE2-LABEL: avg_v4i8:
     10 ; SSE2:       # %bb.0:
     11 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     12 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
     13 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
     14 ; SSE2-NEXT:    movd %xmm1, (%rax)
     15 ; SSE2-NEXT:    retq
     16 ;
     17 ; AVX-LABEL: avg_v4i8:
     18 ; AVX:       # %bb.0:
     19 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     20 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
     21 ; AVX-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
     22 ; AVX-NEXT:    vmovd %xmm0, (%rax)
     23 ; AVX-NEXT:    retq
     24   %1 = load <4 x i8>, <4 x i8>* %a
     25   %2 = load <4 x i8>, <4 x i8>* %b
     26   %3 = zext <4 x i8> %1 to <4 x i32>
     27   %4 = zext <4 x i8> %2 to <4 x i32>
     28   %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
     29   %6 = add nuw nsw <4 x i32> %5, %4
     30   %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
     31   %8 = trunc <4 x i32> %7 to <4 x i8>
     32   store <4 x i8> %8, <4 x i8>* undef, align 4
     33   ret void
     34 }
     35 
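; <8 x i8>: same pattern, loaded and stored through 64-bit movq before pavgb.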
     36 define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
     37 ; SSE2-LABEL: avg_v8i8:
     38 ; SSE2:       # %bb.0:
     39 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
     40 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
     41 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
     42 ; SSE2-NEXT:    movq %xmm1, (%rax)
     43 ; SSE2-NEXT:    retq
     44 ;
     45 ; AVX-LABEL: avg_v8i8:
     46 ; AVX:       # %bb.0:
     47 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
     48 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
     49 ; AVX-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
     50 ; AVX-NEXT:    vmovq %xmm0, (%rax)
     51 ; AVX-NEXT:    retq
     52   %1 = load <8 x i8>, <8 x i8>* %a
     53   %2 = load <8 x i8>, <8 x i8>* %b
     54   %3 = zext <8 x i8> %1 to <8 x i32>
     55   %4 = zext <8 x i8> %2 to <8 x i32>
     56   %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     57   %6 = add nuw nsw <8 x i32> %5, %4
     58   %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     59   %8 = trunc <8 x i32> %7 to <8 x i8>
     60   store <8 x i8> %8, <8 x i8>* undef, align 4
     61   ret void
     62 }
     63 
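; <16 x i8>: a full xmm pavgb with the second operand folded from memory.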
     64 define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
     65 ; SSE2-LABEL: avg_v16i8:
     66 ; SSE2:       # %bb.0:
     67 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
     68 ; SSE2-NEXT:    pavgb (%rdi), %xmm0
     69 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
     70 ; SSE2-NEXT:    retq
     71 ;
     72 ; AVX-LABEL: avg_v16i8:
     73 ; AVX:       # %bb.0:
     74 ; AVX-NEXT:    vmovdqa (%rsi), %xmm0
     75 ; AVX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
     76 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
     77 ; AVX-NEXT:    retq
     78   %1 = load <16 x i8>, <16 x i8>* %a
     79   %2 = load <16 x i8>, <16 x i8>* %b
     80   %3 = zext <16 x i8> %1 to <16 x i32>
     81   %4 = zext <16 x i8> %2 to <16 x i32>
     82   %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     83   %6 = add nuw nsw <16 x i32> %5, %4
     84   %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
     85   %8 = trunc <16 x i32> %7 to <16 x i8>
     86   store <16 x i8> %8, <16 x i8>* undef, align 4
     87   ret void
     88 }
     89 
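; <32 x i8>: two xmm pavgb ops on SSE2 and AVX1, a single ymm vpavgb on AVX2/AVX512.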
     90 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
     91 ; SSE2-LABEL: avg_v32i8:
     92 ; SSE2:       # %bb.0:
     93 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
     94 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
     95 ; SSE2-NEXT:    pavgb (%rdi), %xmm0
     96 ; SSE2-NEXT:    pavgb 16(%rdi), %xmm1
     97 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
     98 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
     99 ; SSE2-NEXT:    retq
    100 ;
    101 ; AVX1-LABEL: avg_v32i8:
    102 ; AVX1:       # %bb.0:
    103 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    104 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
    105 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    106 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
    107 ; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
    108 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
    109 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    110 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    111 ; AVX1-NEXT:    vzeroupper
    112 ; AVX1-NEXT:    retq
    113 ;
    114 ; AVX2-LABEL: avg_v32i8:
    115 ; AVX2:       # %bb.0:
    116 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
    117 ; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
    118 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    119 ; AVX2-NEXT:    vzeroupper
    120 ; AVX2-NEXT:    retq
    121 ;
    122 ; AVX512-LABEL: avg_v32i8:
    123 ; AVX512:       # %bb.0:
    124 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm0
    125 ; AVX512-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
    126 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
    127 ; AVX512-NEXT:    vzeroupper
    128 ; AVX512-NEXT:    retq
    129   %1 = load <32 x i8>, <32 x i8>* %a
    130   %2 = load <32 x i8>, <32 x i8>* %b
    131   %3 = zext <32 x i8> %1 to <32 x i32>
    132   %4 = zext <32 x i8> %2 to <32 x i32>
    133   %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    134   %6 = add nuw nsw <32 x i32> %5, %4
    135   %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    136   %8 = trunc <32 x i32> %7 to <32 x i8>
    137   store <32 x i8> %8, <32 x i8>* undef, align 4
    138   ret void
    139 }
    140 
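; <48 x i8> is not a legal vector width: the SSE2, AVX1, AVX2 and AVX512BW lowerings widen to
; i32, add, shift and repack manually; only the AVX512F lowering recognizes the average and
; uses vpavgb on 128-bit pieces.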
    141 define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
    142 ; SSE2-LABEL: avg_v48i8:
    143 ; SSE2:       # %bb.0:
    144 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
    145 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm6
    146 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm11
    147 ; SSE2-NEXT:    movdqa (%rsi), %xmm12
    148 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm13
    149 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm0
    150 ; SSE2-NEXT:    pxor %xmm7, %xmm7
    151 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
    152 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
    153 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
    154 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
    155 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
    156 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
    157 ; SSE2-NEXT:    movdqa %xmm1, %xmm10
    158 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
    159 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
    160 ; SSE2-NEXT:    movdqa %xmm6, %xmm5
    161 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
    162 ; SSE2-NEXT:    movdqa %xmm5, %xmm15
    163 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
    164 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
    165 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
    166 ; SSE2-NEXT:    movdqa %xmm6, %xmm14
    167 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
    168 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    169 ; SSE2-NEXT:    movdqa %xmm12, %xmm3
    170 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
    171 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
    172 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
    173 ; SSE2-NEXT:    paddd %xmm2, %xmm8
    174 ; SSE2-NEXT:    movdqa %xmm11, %xmm2
    175 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
    176 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
    177 ; SSE2-NEXT:    paddd %xmm4, %xmm3
    178 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
    179 ; SSE2-NEXT:    movdqa %xmm12, %xmm9
    180 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
    181 ; SSE2-NEXT:    paddd %xmm10, %xmm9
    182 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
    183 ; SSE2-NEXT:    paddd %xmm1, %xmm12
    184 ; SSE2-NEXT:    movdqa %xmm13, %xmm4
    185 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
    186 ; SSE2-NEXT:    movdqa %xmm4, %xmm10
    187 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
    188 ; SSE2-NEXT:    paddd %xmm15, %xmm10
    189 ; SSE2-NEXT:    movdqa %xmm2, %xmm15
    190 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
    191 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
    192 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
    193 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
    194 ; SSE2-NEXT:    paddd %xmm5, %xmm4
    195 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
    196 ; SSE2-NEXT:    movdqa %xmm13, %xmm1
    197 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
    198 ; SSE2-NEXT:    paddd %xmm14, %xmm1
    199 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
    200 ; SSE2-NEXT:    paddd %xmm6, %xmm13
    201 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
    202 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
    203 ; SSE2-NEXT:    movdqa %xmm6, %xmm14
    204 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
    205 ; SSE2-NEXT:    paddd %xmm15, %xmm14
    206 ; SSE2-NEXT:    movdqa %xmm11, %xmm5
    207 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
    208 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    209 ; SSE2-NEXT:    paddd %xmm2, %xmm6
    210 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
    211 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    212 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
    213 ; SSE2-NEXT:    paddd %xmm5, %xmm2
    214 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
    215 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
    216 ; SSE2-NEXT:    paddd %xmm11, %xmm0
    217 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
    218 ; SSE2-NEXT:    psubd %xmm5, %xmm8
    219 ; SSE2-NEXT:    psubd %xmm5, %xmm3
    220 ; SSE2-NEXT:    psubd %xmm5, %xmm9
    221 ; SSE2-NEXT:    psubd %xmm5, %xmm12
    222 ; SSE2-NEXT:    psubd %xmm5, %xmm10
    223 ; SSE2-NEXT:    psubd %xmm5, %xmm4
    224 ; SSE2-NEXT:    psubd %xmm5, %xmm1
    225 ; SSE2-NEXT:    psubd %xmm5, %xmm13
    226 ; SSE2-NEXT:    psubd %xmm5, %xmm14
    227 ; SSE2-NEXT:    psubd %xmm5, %xmm6
    228 ; SSE2-NEXT:    psubd %xmm5, %xmm2
    229 ; SSE2-NEXT:    psubd %xmm5, %xmm0
    230 ; SSE2-NEXT:    psrld $1, %xmm3
    231 ; SSE2-NEXT:    psrld $1, %xmm8
    232 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255]
    233 ; SSE2-NEXT:    pand %xmm7, %xmm8
    234 ; SSE2-NEXT:    pand %xmm7, %xmm3
    235 ; SSE2-NEXT:    packuswb %xmm8, %xmm3
    236 ; SSE2-NEXT:    psrld $1, %xmm12
    237 ; SSE2-NEXT:    psrld $1, %xmm9
    238 ; SSE2-NEXT:    pand %xmm7, %xmm9
    239 ; SSE2-NEXT:    pand %xmm7, %xmm12
    240 ; SSE2-NEXT:    packuswb %xmm9, %xmm12
    241 ; SSE2-NEXT:    packuswb %xmm3, %xmm12
    242 ; SSE2-NEXT:    psrld $1, %xmm4
    243 ; SSE2-NEXT:    psrld $1, %xmm10
    244 ; SSE2-NEXT:    pand %xmm7, %xmm10
    245 ; SSE2-NEXT:    pand %xmm7, %xmm4
    246 ; SSE2-NEXT:    packuswb %xmm10, %xmm4
    247 ; SSE2-NEXT:    psrld $1, %xmm13
    248 ; SSE2-NEXT:    psrld $1, %xmm1
    249 ; SSE2-NEXT:    pand %xmm7, %xmm1
    250 ; SSE2-NEXT:    pand %xmm7, %xmm13
    251 ; SSE2-NEXT:    packuswb %xmm1, %xmm13
    252 ; SSE2-NEXT:    packuswb %xmm4, %xmm13
    253 ; SSE2-NEXT:    psrld $1, %xmm6
    254 ; SSE2-NEXT:    psrld $1, %xmm14
    255 ; SSE2-NEXT:    pand %xmm7, %xmm14
    256 ; SSE2-NEXT:    pand %xmm7, %xmm6
    257 ; SSE2-NEXT:    packuswb %xmm14, %xmm6
    258 ; SSE2-NEXT:    psrld $1, %xmm0
    259 ; SSE2-NEXT:    psrld $1, %xmm2
    260 ; SSE2-NEXT:    pand %xmm7, %xmm2
    261 ; SSE2-NEXT:    pand %xmm7, %xmm0
    262 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    263 ; SSE2-NEXT:    packuswb %xmm6, %xmm0
    264 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    265 ; SSE2-NEXT:    movdqu %xmm13, (%rax)
    266 ; SSE2-NEXT:    movdqu %xmm12, (%rax)
    267 ; SSE2-NEXT:    retq
    268 ;
    269 ; AVX1-LABEL: avg_v48i8:
    270 ; AVX1:       # %bb.0:
    271 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
    272 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm5
    273 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
    274 ; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm0
    275 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
    276 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
    277 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    278 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
    279 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    280 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
    281 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
    282 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    283 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
    284 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    285 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
    286 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    287 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
    288 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
    289 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    290 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
    291 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    292 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
    293 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    294 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
    295 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    296 ; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
    297 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    298 ; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
    299 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    300 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
    301 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    302 ; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5
    303 ; AVX1-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
    304 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
    305 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
    306 ; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm9
    307 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
    308 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
    309 ; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm8
    310 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    311 ; AVX1-NEXT:    vpaddd %xmm2, %xmm11, %xmm11
    312 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
    313 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
    314 ; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm12
    315 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
    316 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    317 ; AVX1-NEXT:    vpaddd %xmm5, %xmm13, %xmm13
    318 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
    319 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    320 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
    321 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    322 ; AVX1-NEXT:    vpaddd %xmm1, %xmm15, %xmm15
    323 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
    324 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    325 ; AVX1-NEXT:    vpaddd %xmm4, %xmm14, %xmm4
    326 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
    327 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
    328 ; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
    329 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
    330 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    331 ; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
    332 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    333 ; AVX1-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
    334 ; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
    335 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
    336 ; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm10
    337 ; AVX1-NEXT:    vpsubd %xmm7, %xmm9, %xmm9
    338 ; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm8
    339 ; AVX1-NEXT:    vpsubd %xmm7, %xmm11, %xmm11
    340 ; AVX1-NEXT:    vpsubd %xmm7, %xmm12, %xmm12
    341 ; AVX1-NEXT:    vpsubd %xmm7, %xmm13, %xmm5
    342 ; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
    343 ; AVX1-NEXT:    vpsubd %xmm7, %xmm15, %xmm1
    344 ; AVX1-NEXT:    vpsubd %xmm7, %xmm4, %xmm4
    345 ; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
    346 ; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
    347 ; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
    348 ; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
    349 ; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
    350 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
    351 ; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm2
    352 ; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
    353 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
    354 ; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
    355 ; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
    356 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
    357 ; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm3
    358 ; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm4
    359 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
    360 ; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm4
    361 ; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm5
    362 ; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
    363 ; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm5
    364 ; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm6
    365 ; AVX1-NEXT:    vpackusdw %xmm5, %xmm6, %xmm5
    366 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    367 ; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
    368 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
    369 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
    370 ; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
    371 ; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
    372 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    373 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
    374 ; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
    375 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
    376 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    377 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
    378 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
    379 ; AVX1-NEXT:    vzeroupper
    380 ; AVX1-NEXT:    retq
    381 ;
    382 ; AVX2-LABEL: avg_v48i8:
    383 ; AVX2:       # %bb.0:
    384 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
    385 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
    386 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
    387 ; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm0
    388 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
    389 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
    390 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
    391 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
    392 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
    393 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
    394 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
    395 ; AVX2-NEXT:    vpand %ymm9, %ymm5, %ymm5
    396 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
    397 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    398 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
    399 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
    400 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
    401 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
    402 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
    403 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
    404 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
    405 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
    406 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
    407 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    408 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
    409 ; AVX2-NEXT:    vpand %ymm9, %ymm2, %ymm2
    410 ; AVX2-NEXT:    vpaddd %ymm2, %ymm5, %ymm2
    411 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
    412 ; AVX2-NEXT:    vpaddd %ymm4, %ymm7, %ymm4
    413 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
    414 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
    415 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
    416 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
    417 ; AVX2-NEXT:    vpaddd %ymm3, %ymm11, %ymm3
    418 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
    419 ; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
    420 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    421 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
    422 ; AVX2-NEXT:    vpaddd %ymm0, %ymm10, %ymm0
    423 ; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
    424 ; AVX2-NEXT:    vpsubd %ymm6, %ymm2, %ymm2
    425 ; AVX2-NEXT:    vpsubd %ymm6, %ymm4, %ymm4
    426 ; AVX2-NEXT:    vpsubd %ymm6, %ymm1, %ymm1
    427 ; AVX2-NEXT:    vpsubd %ymm6, %ymm3, %ymm3
    428 ; AVX2-NEXT:    vpsubd %ymm6, %ymm5, %ymm5
    429 ; AVX2-NEXT:    vpsubd %ymm6, %ymm0, %ymm0
    430 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
    431 ; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
    432 ; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5
    433 ; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3
    434 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
    435 ; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4
    436 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
    437 ; AVX2-NEXT:    vpackusdw %xmm6, %xmm4, %xmm4
    438 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    439 ; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
    440 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
    441 ; AVX2-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1
    442 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
    443 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
    444 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
    445 ; AVX2-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
    446 ; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
    447 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
    448 ; AVX2-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
    449 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
    450 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
    451 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
    452 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm2
    453 ; AVX2-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
    454 ; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
    455 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
    456 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
    457 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
    458 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
    459 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
    460 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
    461 ; AVX2-NEXT:    vzeroupper
    462 ; AVX2-NEXT:    retq
    463 ;
    464 ; AVX512F-LABEL: avg_v48i8:
    465 ; AVX512F:       # %bb.0:
    466 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    467 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
    468 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm2
    469 ; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm3
    470 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
    471 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm5
    472 ; AVX512F-NEXT:    vpavgb %xmm5, %xmm4, %xmm4
    473 ; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
    474 ; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
    475 ; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
    476 ; AVX512F-NEXT:    vmovdqu %xmm1, (%rax)
    477 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
    478 ; AVX512F-NEXT:    vzeroupper
    479 ; AVX512F-NEXT:    retq
    480 ;
    481 ; AVX512BW-LABEL: avg_v48i8:
    482 ; AVX512BW:       # %bb.0:
    483 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
    484 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1
    485 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    486 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
    487 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
    488 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
    489 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    490 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
    491 ; AVX512BW-NEXT:    vpaddd %zmm4, %zmm2, %zmm2
    492 ; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm4
    493 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
    494 ; AVX512BW-NEXT:    vpaddd %zmm4, %zmm3, %zmm3
    495 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
    496 ; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
    497 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    498 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    499 ; AVX512BW-NEXT:    vpsubd %zmm1, %zmm2, %zmm2
    500 ; AVX512BW-NEXT:    vpsubd %zmm1, %zmm3, %zmm3
    501 ; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
    502 ; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm0
    503 ; AVX512BW-NEXT:    vpsrld $1, %zmm3, %zmm1
    504 ; AVX512BW-NEXT:    vpsrld $1, %zmm2, %zmm2
    505 ; AVX512BW-NEXT:    vpmovdw %zmm2, %ymm2
    506 ; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
    507 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
    508 ; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
    509 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    510 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
    511 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
    512 ; AVX512BW-NEXT:    vmovdqu %ymm1, (%rax)
    513 ; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, (%rax)
    514 ; AVX512BW-NEXT:    vzeroupper
    515 ; AVX512BW-NEXT:    retq
    516   %1 = load <48 x i8>, <48 x i8>* %a
    517   %2 = load <48 x i8>, <48 x i8>* %b
    518   %3 = zext <48 x i8> %1 to <48 x i32>
    519   %4 = zext <48 x i8> %2 to <48 x i32>
    520   %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    521   %6 = add nuw nsw <48 x i32> %5, %4
    522   %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    523   %8 = trunc <48 x i32> %7 to <48 x i8>
    524   store <48 x i8> %8, <48 x i8>* undef, align 4
    525   ret void
    526 }
    527 
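; <64 x i8>: four xmm pavgb ops on SSE2 and AVX1, two ymm vpavgb ops on AVX2/AVX512F, and a
; single zmm vpavgb on AVX512BW.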
    528 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
    529 ; SSE2-LABEL: avg_v64i8:
    530 ; SSE2:       # %bb.0:
    531 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
    532 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
    533 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
    534 ; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
    535 ; SSE2-NEXT:    pavgb (%rdi), %xmm0
    536 ; SSE2-NEXT:    pavgb 16(%rdi), %xmm1
    537 ; SSE2-NEXT:    pavgb 32(%rdi), %xmm2
    538 ; SSE2-NEXT:    pavgb 48(%rdi), %xmm3
    539 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
    540 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
    541 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
    542 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    543 ; SSE2-NEXT:    retq
    544 ;
    545 ; AVX1-LABEL: avg_v64i8:
    546 ; AVX1:       # %bb.0:
    547 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    548 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
    549 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
    550 ; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
    551 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    552 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
    553 ; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
    554 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
    555 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    556 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    557 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
    558 ; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
    559 ; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
    560 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    561 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
    562 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    563 ; AVX1-NEXT:    vzeroupper
    564 ; AVX1-NEXT:    retq
    565 ;
    566 ; AVX2-LABEL: avg_v64i8:
    567 ; AVX2:       # %bb.0:
    568 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
    569 ; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
    570 ; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
    571 ; AVX2-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
    572 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
    573 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    574 ; AVX2-NEXT:    vzeroupper
    575 ; AVX2-NEXT:    retq
    576 ;
    577 ; AVX512F-LABEL: avg_v64i8:
    578 ; AVX512F:       # %bb.0:
    579 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
    580 ; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
    581 ; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
    582 ; AVX512F-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
    583 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
    584 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
    585 ; AVX512F-NEXT:    vzeroupper
    586 ; AVX512F-NEXT:    retq
    587 ;
    588 ; AVX512BW-LABEL: avg_v64i8:
    589 ; AVX512BW:       # %bb.0:
    590 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
    591 ; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
    592 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
    593 ; AVX512BW-NEXT:    vzeroupper
    594 ; AVX512BW-NEXT:    retq
    595   %1 = load <64 x i8>, <64 x i8>* %a
    596   %2 = load <64 x i8>, <64 x i8>* %b
    597   %3 = zext <64 x i8> %1 to <64 x i32>
    598   %4 = zext <64 x i8> %2 to <64 x i32>
    599   %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    600   %6 = add nuw nsw <64 x i32> %5, %4
    601   %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    602   %8 = trunc <64 x i32> %7 to <64 x i8>
    603   store <64 x i8> %8, <64 x i8>* undef, align 4
    604   ret void
    605 }
    606 
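; The i16 variants should select pavgw instead; <4 x i16> goes through movq loads and stores.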
    607 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
    608 ; SSE2-LABEL: avg_v4i16:
    609 ; SSE2:       # %bb.0:
    610 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    611 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    612 ; SSE2-NEXT:    pavgw %xmm0, %xmm1
    613 ; SSE2-NEXT:    movq %xmm1, (%rax)
    614 ; SSE2-NEXT:    retq
    615 ;
    616 ; AVX-LABEL: avg_v4i16:
    617 ; AVX:       # %bb.0:
    618 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    619 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
    620 ; AVX-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
    621 ; AVX-NEXT:    vmovq %xmm0, (%rax)
    622 ; AVX-NEXT:    retq
    623   %1 = load <4 x i16>, <4 x i16>* %a
    624   %2 = load <4 x i16>, <4 x i16>* %b
    625   %3 = zext <4 x i16> %1 to <4 x i32>
    626   %4 = zext <4 x i16> %2 to <4 x i32>
    627   %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
    628   %6 = add nuw nsw <4 x i32> %5, %4
    629   %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
    630   %8 = trunc <4 x i32> %7 to <4 x i16>
    631   store <4 x i16> %8, <4 x i16>* undef, align 4
    632   ret void
    633 }
    634 
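; <8 x i16>: full xmm pavgw with a folded memory operand.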
    635 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
    636 ; SSE2-LABEL: avg_v8i16:
    637 ; SSE2:       # %bb.0:
    638 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
    639 ; SSE2-NEXT:    pavgw (%rdi), %xmm0
    640 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    641 ; SSE2-NEXT:    retq
    642 ;
    643 ; AVX-LABEL: avg_v8i16:
    644 ; AVX:       # %bb.0:
    645 ; AVX-NEXT:    vmovdqa (%rsi), %xmm0
    646 ; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
    647 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
    648 ; AVX-NEXT:    retq
    649   %1 = load <8 x i16>, <8 x i16>* %a
    650   %2 = load <8 x i16>, <8 x i16>* %b
    651   %3 = zext <8 x i16> %1 to <8 x i32>
    652   %4 = zext <8 x i16> %2 to <8 x i32>
    653   %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    654   %6 = add nuw nsw <8 x i32> %5, %4
    655   %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    656   %8 = trunc <8 x i32> %7 to <8 x i16>
    657   store <8 x i16> %8, <8 x i16>* undef, align 4
    658   ret void
    659 }
    660 
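; <16 x i16>: two xmm pavgw ops on SSE2/AVX1, one ymm vpavgw on AVX2/AVX512.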
    661 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
    662 ; SSE2-LABEL: avg_v16i16:
    663 ; SSE2:       # %bb.0:
    664 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
    665 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
    666 ; SSE2-NEXT:    pavgw (%rdi), %xmm0
    667 ; SSE2-NEXT:    pavgw 16(%rdi), %xmm1
    668 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
    669 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    670 ; SSE2-NEXT:    retq
    671 ;
    672 ; AVX1-LABEL: avg_v16i16:
    673 ; AVX1:       # %bb.0:
    674 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    675 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
    676 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    677 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
    678 ; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
    679 ; AVX1-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
    680 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    681 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    682 ; AVX1-NEXT:    vzeroupper
    683 ; AVX1-NEXT:    retq
    684 ;
    685 ; AVX2-LABEL: avg_v16i16:
    686 ; AVX2:       # %bb.0:
    687 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
    688 ; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
    689 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    690 ; AVX2-NEXT:    vzeroupper
    691 ; AVX2-NEXT:    retq
    692 ;
    693 ; AVX512-LABEL: avg_v16i16:
    694 ; AVX512:       # %bb.0:
    695 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm0
    696 ; AVX512-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
    697 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
    698 ; AVX512-NEXT:    vzeroupper
    699 ; AVX512-NEXT:    retq
    700   %1 = load <16 x i16>, <16 x i16>* %a
    701   %2 = load <16 x i16>, <16 x i16>* %b
    702   %3 = zext <16 x i16> %1 to <16 x i32>
    703   %4 = zext <16 x i16> %2 to <16 x i32>
    704   %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    705   %6 = add nuw nsw <16 x i32> %5, %4
    706   %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    707   %8 = trunc <16 x i32> %7 to <16 x i16>
    708   store <16 x i16> %8, <16 x i16>* undef, align 4
    709   ret void
    710 }
    711 
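; <32 x i16>: four xmm pavgw ops on SSE2 and AVX1, two ymm vpavgw ops on AVX2/AVX512F, one zmm
; vpavgw on AVX512BW.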
    712 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
    713 ; SSE2-LABEL: avg_v32i16:
    714 ; SSE2:       # %bb.0:
    715 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
    716 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
    717 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
    718 ; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
    719 ; SSE2-NEXT:    pavgw (%rdi), %xmm0
    720 ; SSE2-NEXT:    pavgw 16(%rdi), %xmm1
    721 ; SSE2-NEXT:    pavgw 32(%rdi), %xmm2
    722 ; SSE2-NEXT:    pavgw 48(%rdi), %xmm3
    723 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
    724 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
    725 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
    726 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    727 ; SSE2-NEXT:    retq
    728 ;
    729 ; AVX1-LABEL: avg_v32i16:
    730 ; AVX1:       # %bb.0:
    731 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    732 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
    733 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
    734 ; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
    735 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    736 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
    737 ; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
    738 ; AVX1-NEXT:    vpavgw %xmm0, %xmm2, %xmm0
    739 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
    740 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    741 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
    742 ; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
    743 ; AVX1-NEXT:    vpavgw %xmm1, %xmm3, %xmm1
    744 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    745 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
    746 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    747 ; AVX1-NEXT:    vzeroupper
    748 ; AVX1-NEXT:    retq
    749 ;
    750 ; AVX2-LABEL: avg_v32i16:
    751 ; AVX2:       # %bb.0:
    752 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
    753 ; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
    754 ; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
    755 ; AVX2-NEXT:    vpavgw 32(%rdi), %ymm1, %ymm1
    756 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
    757 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    758 ; AVX2-NEXT:    vzeroupper
    759 ; AVX2-NEXT:    retq
    760 ;
    761 ; AVX512F-LABEL: avg_v32i16:
    762 ; AVX512F:       # %bb.0:
    763 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
    764 ; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
    765 ; AVX512F-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
    766 ; AVX512F-NEXT:    vpavgw 32(%rdi), %ymm1, %ymm1
    767 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
    768 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
    769 ; AVX512F-NEXT:    vzeroupper
    770 ; AVX512F-NEXT:    retq
    771 ;
    772 ; AVX512BW-LABEL: avg_v32i16:
    773 ; AVX512BW:       # %bb.0:
    774 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
    775 ; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
    776 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
    777 ; AVX512BW-NEXT:    vzeroupper
    778 ; AVX512BW-NEXT:    retq
    779   %1 = load <32 x i16>, <32 x i16>* %a
    780   %2 = load <32 x i16>, <32 x i16>* %b
    781   %3 = zext <32 x i16> %1 to <32 x i32>
    782   %4 = zext <32 x i16> %2 to <32 x i32>
    783   %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    784   %6 = add nuw nsw <32 x i32> %5, %4
    785   %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    786   %8 = trunc <32 x i32> %7 to <32 x i16>
    787   store <32 x i16> %8, <32 x i16>* undef, align 4
    788   ret void
    789 }
    790 
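; The _2 variants below commute the pattern to (zext(a) + zext(b)) + 1 before the shift; they
; should still select pavgb/pavgw.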
    791 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
    792 ; SSE2-LABEL: avg_v4i8_2:
    793 ; SSE2:       # %bb.0:
    794 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    795 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    796 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
    797 ; SSE2-NEXT:    movd %xmm1, (%rax)
    798 ; SSE2-NEXT:    retq
    799 ;
    800 ; AVX-LABEL: avg_v4i8_2:
    801 ; AVX:       # %bb.0:
    802 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    803 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    804 ; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
    805 ; AVX-NEXT:    vmovd %xmm0, (%rax)
    806 ; AVX-NEXT:    retq
    807   %1 = load <4 x i8>, <4 x i8>* %a
    808   %2 = load <4 x i8>, <4 x i8>* %b
    809   %3 = zext <4 x i8> %1 to <4 x i32>
    810   %4 = zext <4 x i8> %2 to <4 x i32>
    811   %5 = add nuw nsw <4 x i32> %3, %4
    812   %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
    813   %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
    814   %8 = trunc <4 x i32> %7 to <4 x i8>
    815   store <4 x i8> %8, <4 x i8>* undef, align 4
    816   ret void
    817 }
    818 
    819 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
    820 ; SSE2-LABEL: avg_v8i8_2:
    821 ; SSE2:       # %bb.0:
    822 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    823 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    824 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
    825 ; SSE2-NEXT:    movq %xmm1, (%rax)
    826 ; SSE2-NEXT:    retq
    827 ;
    828 ; AVX-LABEL: avg_v8i8_2:
    829 ; AVX:       # %bb.0:
    830 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    831 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
    832 ; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
    833 ; AVX-NEXT:    vmovq %xmm0, (%rax)
    834 ; AVX-NEXT:    retq
    835   %1 = load <8 x i8>, <8 x i8>* %a
    836   %2 = load <8 x i8>, <8 x i8>* %b
    837   %3 = zext <8 x i8> %1 to <8 x i32>
    838   %4 = zext <8 x i8> %2 to <8 x i32>
    839   %5 = add nuw nsw <8 x i32> %3, %4
    840   %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    841   %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    842   %8 = trunc <8 x i32> %7 to <8 x i8>
    843   store <8 x i8> %8, <8 x i8>* undef, align 4
    844   ret void
    845 }
    846 
    847 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
    848 ; SSE2-LABEL: avg_v16i8_2:
    849 ; SSE2:       # %bb.0:
    850 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
    851 ; SSE2-NEXT:    pavgb (%rsi), %xmm0
    852 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    853 ; SSE2-NEXT:    retq
    854 ;
    855 ; AVX-LABEL: avg_v16i8_2:
    856 ; AVX:       # %bb.0:
    857 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
    858 ; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
    859 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
    860 ; AVX-NEXT:    retq
    861   %1 = load <16 x i8>, <16 x i8>* %a
    862   %2 = load <16 x i8>, <16 x i8>* %b
    863   %3 = zext <16 x i8> %1 to <16 x i32>
    864   %4 = zext <16 x i8> %2 to <16 x i32>
    865   %5 = add nuw nsw <16 x i32> %3, %4
    866   %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    867   %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    868   %8 = trunc <16 x i32> %7 to <16 x i8>
    869   store <16 x i8> %8, <16 x i8>* undef, align 4
    870   ret void
    871 }
    872 
    873 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
    874 ; SSE2-LABEL: avg_v32i8_2:
    875 ; SSE2:       # %bb.0:
    876 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
    877 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
    878 ; SSE2-NEXT:    pavgb (%rsi), %xmm0
    879 ; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
    880 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
    881 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    882 ; SSE2-NEXT:    retq
    883 ;
    884 ; AVX1-LABEL: avg_v32i8_2:
    885 ; AVX1:       # %bb.0:
    886 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    887 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
    888 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    889 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
    890 ; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
    891 ; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
    892 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    893 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    894 ; AVX1-NEXT:    vzeroupper
    895 ; AVX1-NEXT:    retq
    896 ;
    897 ; AVX2-LABEL: avg_v32i8_2:
    898 ; AVX2:       # %bb.0:
    899 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    900 ; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
    901 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    902 ; AVX2-NEXT:    vzeroupper
    903 ; AVX2-NEXT:    retq
    904 ;
    905 ; AVX512-LABEL: avg_v32i8_2:
    906 ; AVX512:       # %bb.0:
    907 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
    908 ; AVX512-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
    909 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
    910 ; AVX512-NEXT:    vzeroupper
    911 ; AVX512-NEXT:    retq
    912   %1 = load <32 x i8>, <32 x i8>* %a
    913   %2 = load <32 x i8>, <32 x i8>* %b
    914   %3 = zext <32 x i8> %1 to <32 x i32>
    915   %4 = zext <32 x i8> %2 to <32 x i32>
    916   %5 = add nuw nsw <32 x i32> %3, %4
    917   %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    918   %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    919   %8 = trunc <32 x i32> %7 to <32 x i8>
    920   store <32 x i8> %8, <32 x i8>* undef, align 4
    921   ret void
    922 }
    923 
    924 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
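; avg_v64i8_2 adds %4 to itself, i.e. it averages b with b, so only the (%rsi) operand is
; loaded and each vector is averaged with itself.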
    925 ; SSE2-LABEL: avg_v64i8_2:
    926 ; SSE2:       # %bb.0:
    927 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
    928 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
    929 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
    930 ; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
    931 ; SSE2-NEXT:    pavgb %xmm0, %xmm0
    932 ; SSE2-NEXT:    pavgb %xmm1, %xmm1
    933 ; SSE2-NEXT:    pavgb %xmm2, %xmm2
    934 ; SSE2-NEXT:    pavgb %xmm3, %xmm3
    935 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
    936 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
    937 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
    938 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
    939 ; SSE2-NEXT:    retq
    940 ;
    941 ; AVX1-LABEL: avg_v64i8_2:
    942 ; AVX1:       # %bb.0:
    943 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm0
    944 ; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm1
    945 ; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm2
    946 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    947 ; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm0
    948 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    949 ; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm2
    950 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
    951 ; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm1
    952 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    953 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
    954 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
    955 ; AVX1-NEXT:    vzeroupper
    956 ; AVX1-NEXT:    retq
    957 ;
    958 ; AVX2-LABEL: avg_v64i8_2:
    959 ; AVX2:       # %bb.0:
    960 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
    961 ; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
    962 ; AVX2-NEXT:    vpavgb %ymm0, %ymm0, %ymm0
    963 ; AVX2-NEXT:    vpavgb %ymm1, %ymm1, %ymm1
    964 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
    965 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
    966 ; AVX2-NEXT:    vzeroupper
    967 ; AVX2-NEXT:    retq
    968 ;
    969 ; AVX512F-LABEL: avg_v64i8_2:
    970 ; AVX512F:       # %bb.0:
    971 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
    972 ; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
    973 ; AVX512F-NEXT:    vpavgb %ymm0, %ymm0, %ymm0
    974 ; AVX512F-NEXT:    vpavgb %ymm1, %ymm1, %ymm1
    975 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
    976 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
    977 ; AVX512F-NEXT:    vzeroupper
    978 ; AVX512F-NEXT:    retq
    979 ;
    980 ; AVX512BW-LABEL: avg_v64i8_2:
    981 ; AVX512BW:       # %bb.0:
    982 ; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
    983 ; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
    984 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
    985 ; AVX512BW-NEXT:    vzeroupper
    986 ; AVX512BW-NEXT:    retq
    987   %1 = load <64 x i8>, <64 x i8>* %a
    988   %2 = load <64 x i8>, <64 x i8>* %b
    989   %3 = zext <64 x i8> %1 to <64 x i32>
    990   %4 = zext <64 x i8> %2 to <64 x i32>
    991   %5 = add nuw nsw <64 x i32> %4, %4
    992   %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    993   %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    994   %8 = trunc <64 x i32> %7 to <64 x i8>
    995   store <64 x i8> %8, <64 x i8>* undef, align 4
    996   ret void
    997 }
    998 
    999 
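; pavgb/pavgw compute (a + b + 1) >> 1 without losing the carry, so the
; zext / add / add 1 / lshr 1 / trunc sequences in these tests are exact
; matches; e.g. for the i8 inputs 254 and 255, (254 + 255 + 1) >> 1 = 255,
; which still fits in a byte after the truncate.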
   1000 define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
   1001 ; SSE2-LABEL: avg_v4i16_2:
   1002 ; SSE2:       # %bb.0:
   1003 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1004 ; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
   1005 ; SSE2-NEXT:    pavgw %xmm0, %xmm1
   1006 ; SSE2-NEXT:    movq %xmm1, (%rax)
   1007 ; SSE2-NEXT:    retq
   1008 ;
   1009 ; AVX-LABEL: avg_v4i16_2:
   1010 ; AVX:       # %bb.0:
   1011 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1012 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
   1013 ; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
   1014 ; AVX-NEXT:    vmovq %xmm0, (%rax)
   1015 ; AVX-NEXT:    retq
   1016   %1 = load <4 x i16>, <4 x i16>* %a
   1017   %2 = load <4 x i16>, <4 x i16>* %b
   1018   %3 = zext <4 x i16> %1 to <4 x i32>
   1019   %4 = zext <4 x i16> %2 to <4 x i32>
   1020   %5 = add nuw nsw <4 x i32> %3, %4
   1021   %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
   1022   %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
   1023   %8 = trunc <4 x i32> %7 to <4 x i16>
   1024   store <4 x i16> %8, <4 x i16>* undef, align 4
   1025   ret void
   1026 }
   1027 
   1028 define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
   1029 ; SSE2-LABEL: avg_v8i16_2:
   1030 ; SSE2:       # %bb.0:
   1031 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
   1032 ; SSE2-NEXT:    pavgw (%rsi), %xmm0
   1033 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1034 ; SSE2-NEXT:    retq
   1035 ;
   1036 ; AVX-LABEL: avg_v8i16_2:
   1037 ; AVX:       # %bb.0:
   1038 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1039 ; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
   1040 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
   1041 ; AVX-NEXT:    retq
   1042   %1 = load <8 x i16>, <8 x i16>* %a
   1043   %2 = load <8 x i16>, <8 x i16>* %b
   1044   %3 = zext <8 x i16> %1 to <8 x i32>
   1045   %4 = zext <8 x i16> %2 to <8 x i32>
   1046   %5 = add nuw nsw <8 x i32> %3, %4
   1047   %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1048   %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1049   %8 = trunc <8 x i32> %7 to <8 x i16>
   1050   store <8 x i16> %8, <8 x i16>* undef, align 4
   1051   ret void
   1052 }
   1053 
   1054 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
   1055 ; SSE2-LABEL: avg_v16i16_2:
   1056 ; SSE2:       # %bb.0:
   1057 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
   1058 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
   1059 ; SSE2-NEXT:    pavgw (%rsi), %xmm0
   1060 ; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
   1061 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1062 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1063 ; SSE2-NEXT:    retq
   1064 ;
   1065 ; AVX1-LABEL: avg_v16i16_2:
   1066 ; AVX1:       # %bb.0:
   1067 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1068 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
   1069 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1070 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1071 ; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
   1072 ; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
   1073 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1074 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1075 ; AVX1-NEXT:    vzeroupper
   1076 ; AVX1-NEXT:    retq
   1077 ;
   1078 ; AVX2-LABEL: avg_v16i16_2:
   1079 ; AVX2:       # %bb.0:
   1080 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1081 ; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
   1082 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1083 ; AVX2-NEXT:    vzeroupper
   1084 ; AVX2-NEXT:    retq
   1085 ;
   1086 ; AVX512-LABEL: avg_v16i16_2:
   1087 ; AVX512:       # %bb.0:
   1088 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
   1089 ; AVX512-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
   1090 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
   1091 ; AVX512-NEXT:    vzeroupper
   1092 ; AVX512-NEXT:    retq
   1093   %1 = load <16 x i16>, <16 x i16>* %a
   1094   %2 = load <16 x i16>, <16 x i16>* %b
   1095   %3 = zext <16 x i16> %1 to <16 x i32>
   1096   %4 = zext <16 x i16> %2 to <16 x i32>
   1097   %5 = add nuw nsw <16 x i32> %3, %4
   1098   %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1099   %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1100   %8 = trunc <16 x i32> %7 to <16 x i16>
   1101   store <16 x i16> %8, <16 x i16>* undef, align 4
   1102   ret void
   1103 }
   1104 
   1105 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
   1106 ; SSE2-LABEL: avg_v32i16_2:
   1107 ; SSE2:       # %bb.0:
   1108 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
   1109 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
   1110 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
   1111 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
   1112 ; SSE2-NEXT:    pavgw (%rsi), %xmm0
   1113 ; SSE2-NEXT:    pavgw 16(%rsi), %xmm1
   1114 ; SSE2-NEXT:    pavgw 32(%rsi), %xmm2
   1115 ; SSE2-NEXT:    pavgw 48(%rsi), %xmm3
   1116 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
   1117 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
   1118 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1119 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1120 ; SSE2-NEXT:    retq
   1121 ;
   1122 ; AVX1-LABEL: avg_v32i16_2:
   1123 ; AVX1:       # %bb.0:
   1124 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1125 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
   1126 ; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
   1127 ; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
   1128 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   1129 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
   1130 ; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
   1131 ; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
   1132 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1133 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
   1134 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1135 ; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
   1136 ; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
   1137 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1138 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
   1139 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1140 ; AVX1-NEXT:    vzeroupper
   1141 ; AVX1-NEXT:    retq
   1142 ;
   1143 ; AVX2-LABEL: avg_v32i16_2:
   1144 ; AVX2:       # %bb.0:
   1145 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1146 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
   1147 ; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
   1148 ; AVX2-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
   1149 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
   1150 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1151 ; AVX2-NEXT:    vzeroupper
   1152 ; AVX2-NEXT:    retq
   1153 ;
   1154 ; AVX512F-LABEL: avg_v32i16_2:
   1155 ; AVX512F:       # %bb.0:
   1156 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
   1157 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
   1158 ; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
   1159 ; AVX512F-NEXT:    vpavgw 32(%rsi), %ymm1, %ymm1
   1160 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
   1161 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
   1162 ; AVX512F-NEXT:    vzeroupper
   1163 ; AVX512F-NEXT:    retq
   1164 ;
   1165 ; AVX512BW-LABEL: avg_v32i16_2:
   1166 ; AVX512BW:       # %bb.0:
   1167 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
   1168 ; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
   1169 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
   1170 ; AVX512BW-NEXT:    vzeroupper
   1171 ; AVX512BW-NEXT:    retq
   1172   %1 = load <32 x i16>, <32 x i16>* %a
   1173   %2 = load <32 x i16>, <32 x i16>* %b
   1174   %3 = zext <32 x i16> %1 to <32 x i32>
   1175   %4 = zext <32 x i16> %2 to <32 x i32>
   1176   %5 = add nuw nsw <32 x i32> %3, %4
   1177   %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1178   %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1179   %8 = trunc <32 x i32> %7 to <32 x i16>
   1180   store <32 x i16> %8, <32 x i16>* undef, align 4
   1181   ret void
   1182 }
   1183 
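; The *_const tests add a per-lane constant <1,2,...,8> before the shift.
; Because pavg already adds 1, the matched constant operand is <0,1,...,7>,
; which the checks expect either folded into a {{.*}}(%rip) memory operand or
; materialized once and reused across the split halves.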
   1184 define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
   1185 ; SSE2-LABEL: avg_v4i8_const:
   1186 ; SSE2:       # %bb.0:
   1187 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1188 ; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
   1189 ; SSE2-NEXT:    movd %xmm0, (%rax)
   1190 ; SSE2-NEXT:    retq
   1191 ;
   1192 ; AVX-LABEL: avg_v4i8_const:
   1193 ; AVX:       # %bb.0:
   1194 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1195 ; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
   1196 ; AVX-NEXT:    vmovd %xmm0, (%rax)
   1197 ; AVX-NEXT:    retq
   1198   %1 = load <4 x i8>, <4 x i8>* %a
   1199   %2 = zext <4 x i8> %1 to <4 x i32>
   1200   %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
   1201   %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
   1202   %5 = trunc <4 x i32> %4 to <4 x i8>
   1203   store <4 x i8> %5, <4 x i8>* undef, align 4
   1204   ret void
   1205 }
   1206 
   1207 define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
   1208 ; SSE2-LABEL: avg_v8i8_const:
   1209 ; SSE2:       # %bb.0:
   1210 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1211 ; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
   1212 ; SSE2-NEXT:    movq %xmm0, (%rax)
   1213 ; SSE2-NEXT:    retq
   1214 ;
   1215 ; AVX-LABEL: avg_v8i8_const:
   1216 ; AVX:       # %bb.0:
   1217 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1218 ; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
   1219 ; AVX-NEXT:    vmovq %xmm0, (%rax)
   1220 ; AVX-NEXT:    retq
   1221   %1 = load <8 x i8>, <8 x i8>* %a
   1222   %2 = zext <8 x i8> %1 to <8 x i32>
   1223   %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1224   %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1225   %5 = trunc <8 x i32> %4 to <8 x i8>
   1226   store <8 x i8> %5, <8 x i8>* undef, align 4
   1227   ret void
   1228 }
   1229 
   1230 define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
   1231 ; SSE2-LABEL: avg_v16i8_const:
   1232 ; SSE2:       # %bb.0:
   1233 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
   1234 ; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
   1235 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1236 ; SSE2-NEXT:    retq
   1237 ;
   1238 ; AVX-LABEL: avg_v16i8_const:
   1239 ; AVX:       # %bb.0:
   1240 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1241 ; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
   1242 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
   1243 ; AVX-NEXT:    retq
   1244   %1 = load <16 x i8>, <16 x i8>* %a
   1245   %2 = zext <16 x i8> %1 to <16 x i32>
   1246   %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1247   %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1248   %5 = trunc <16 x i32> %4 to <16 x i8>
   1249   store <16 x i8> %5, <16 x i8>* undef, align 4
   1250   ret void
   1251 }
   1252 
   1253 define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
   1254 ; SSE2-LABEL: avg_v32i8_const:
   1255 ; SSE2:       # %bb.0:
   1256 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
   1257 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
   1258 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
   1259 ; SSE2-NEXT:    pavgb 16(%rdi), %xmm0
   1260 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1261 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1262 ; SSE2-NEXT:    retq
   1263 ;
   1264 ; AVX1-LABEL: avg_v32i8_const:
   1265 ; AVX1:       # %bb.0:
   1266 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1267 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1268 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
   1269 ; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1
   1270 ; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
   1271 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1272 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1273 ; AVX1-NEXT:    vzeroupper
   1274 ; AVX1-NEXT:    retq
   1275 ;
   1276 ; AVX2-LABEL: avg_v32i8_const:
   1277 ; AVX2:       # %bb.0:
   1278 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1279 ; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
   1280 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1281 ; AVX2-NEXT:    vzeroupper
   1282 ; AVX2-NEXT:    retq
   1283 ;
   1284 ; AVX512-LABEL: avg_v32i8_const:
   1285 ; AVX512:       # %bb.0:
   1286 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
   1287 ; AVX512-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
   1288 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
   1289 ; AVX512-NEXT:    vzeroupper
   1290 ; AVX512-NEXT:    retq
   1291   %1 = load <32 x i8>, <32 x i8>* %a
   1292   %2 = zext <32 x i8> %1 to <32 x i32>
   1293   %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1294   %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1295   %5 = trunc <32 x i32> %4 to <32 x i8>
   1296   store <32 x i8> %5, <32 x i8>* undef, align 4
   1297   ret void
   1298 }
   1299 
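; In the AVX2/AVX512F checks below the folded constant is rematerialized with
; vpbroadcastq: 506097522914230528 is 0x0706050403020100, i.e. the byte
; pattern <0,1,2,3,4,5,6,7> splatted across the register.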
   1300 define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
   1301 ; SSE2-LABEL: avg_v64i8_const:
   1302 ; SSE2:       # %bb.0:
   1303 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
   1304 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
   1305 ; SSE2-NEXT:    pavgb %xmm0, %xmm1
   1306 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
   1307 ; SSE2-NEXT:    pavgb %xmm0, %xmm2
   1308 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
   1309 ; SSE2-NEXT:    pavgb %xmm0, %xmm3
   1310 ; SSE2-NEXT:    pavgb 48(%rdi), %xmm0
   1311 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1312 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
   1313 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
   1314 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1315 ; SSE2-NEXT:    retq
   1316 ;
   1317 ; AVX1-LABEL: avg_v64i8_const:
   1318 ; AVX1:       # %bb.0:
   1319 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1320 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
   1321 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1322 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
   1323 ; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
   1324 ; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0
   1325 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1326 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1327 ; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
   1328 ; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
   1329 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1330 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
   1331 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1332 ; AVX1-NEXT:    vzeroupper
   1333 ; AVX1-NEXT:    retq
   1334 ;
   1335 ; AVX2-LABEL: avg_v64i8_const:
   1336 ; AVX2:       # %bb.0:
   1337 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
   1338 ; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
   1339 ; AVX2-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
   1340 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1341 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
   1342 ; AVX2-NEXT:    vzeroupper
   1343 ; AVX2-NEXT:    retq
   1344 ;
   1345 ; AVX512F-LABEL: avg_v64i8_const:
   1346 ; AVX512F:       # %bb.0:
   1347 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
   1348 ; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
   1349 ; AVX512F-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
   1350 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
   1351 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
   1352 ; AVX512F-NEXT:    vzeroupper
   1353 ; AVX512F-NEXT:    retq
   1354 ;
   1355 ; AVX512BW-LABEL: avg_v64i8_const:
   1356 ; AVX512BW:       # %bb.0:
   1357 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
   1358 ; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
   1359 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
   1360 ; AVX512BW-NEXT:    vzeroupper
   1361 ; AVX512BW-NEXT:    retq
   1362   %1 = load <64 x i8>, <64 x i8>* %a
   1363   %2 = zext <64 x i8> %1 to <64 x i32>
   1364   %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1365   %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1366   %5 = trunc <64 x i32> %4 to <64 x i8>
   1367   store <64 x i8> %5, <64 x i8>* undef, align 4
   1368   ret void
   1369 }
   1370 
   1371 define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
   1372 ; SSE2-LABEL: avg_v4i16_const:
   1373 ; SSE2:       # %bb.0:
   1374 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
   1375 ; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
   1376 ; SSE2-NEXT:    movq %xmm0, (%rax)
   1377 ; SSE2-NEXT:    retq
   1378 ;
   1379 ; AVX-LABEL: avg_v4i16_const:
   1380 ; AVX:       # %bb.0:
   1381 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
   1382 ; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
   1383 ; AVX-NEXT:    vmovq %xmm0, (%rax)
   1384 ; AVX-NEXT:    retq
   1385   %1 = load <4 x i16>, <4 x i16>* %a
   1386   %2 = zext <4 x i16> %1 to <4 x i32>
   1387   %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
   1388   %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
   1389   %5 = trunc <4 x i32> %4 to <4 x i16>
   1390   store <4 x i16> %5, <4 x i16>* undef, align 4
   1391   ret void
   1392 }
   1393 
   1394 define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
   1395 ; SSE2-LABEL: avg_v8i16_const:
   1396 ; SSE2:       # %bb.0:
   1397 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
   1398 ; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
   1399 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1400 ; SSE2-NEXT:    retq
   1401 ;
   1402 ; AVX-LABEL: avg_v8i16_const:
   1403 ; AVX:       # %bb.0:
   1404 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1405 ; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
   1406 ; AVX-NEXT:    vmovdqu %xmm0, (%rax)
   1407 ; AVX-NEXT:    retq
   1408   %1 = load <8 x i16>, <8 x i16>* %a
   1409   %2 = zext <8 x i16> %1 to <8 x i32>
   1410   %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1411   %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1412   %5 = trunc <8 x i32> %4 to <8 x i16>
   1413   store <8 x i16> %5, <8 x i16>* undef, align 4
   1414   ret void
   1415 }
   1416 
   1417 define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
   1418 ; SSE2-LABEL: avg_v16i16_const:
   1419 ; SSE2:       # %bb.0:
   1420 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
   1421 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
   1422 ; SSE2-NEXT:    pavgw %xmm0, %xmm1
   1423 ; SSE2-NEXT:    pavgw 16(%rdi), %xmm0
   1424 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1425 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1426 ; SSE2-NEXT:    retq
   1427 ;
   1428 ; AVX1-LABEL: avg_v16i16_const:
   1429 ; AVX1:       # %bb.0:
   1430 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1431 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1432 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
   1433 ; AVX1-NEXT:    vpavgw %xmm2, %xmm1, %xmm1
   1434 ; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
   1435 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1436 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1437 ; AVX1-NEXT:    vzeroupper
   1438 ; AVX1-NEXT:    retq
   1439 ;
   1440 ; AVX2-LABEL: avg_v16i16_const:
   1441 ; AVX2:       # %bb.0:
   1442 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1443 ; AVX2-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
   1444 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1445 ; AVX2-NEXT:    vzeroupper
   1446 ; AVX2-NEXT:    retq
   1447 ;
   1448 ; AVX512-LABEL: avg_v16i16_const:
   1449 ; AVX512:       # %bb.0:
   1450 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
   1451 ; AVX512-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
   1452 ; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
   1453 ; AVX512-NEXT:    vzeroupper
   1454 ; AVX512-NEXT:    retq
   1455   %1 = load <16 x i16>, <16 x i16>* %a
   1456   %2 = zext <16 x i16> %1 to <16 x i32>
   1457   %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1458   %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1459   %5 = trunc <16 x i32> %4 to <16 x i16>
   1460   store <16 x i16> %5, <16 x i16>* undef, align 4
   1461   ret void
   1462 }
   1463 
   1464 define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
   1465 ; SSE2-LABEL: avg_v32i16_const:
   1466 ; SSE2:       # %bb.0:
   1467 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
   1468 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
   1469 ; SSE2-NEXT:    pavgw %xmm0, %xmm1
   1470 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
   1471 ; SSE2-NEXT:    pavgw %xmm0, %xmm2
   1472 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
   1473 ; SSE2-NEXT:    pavgw %xmm0, %xmm3
   1474 ; SSE2-NEXT:    pavgw 48(%rdi), %xmm0
   1475 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
   1476 ; SSE2-NEXT:    movdqu %xmm3, (%rax)
   1477 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
   1478 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   1479 ; SSE2-NEXT:    retq
   1480 ;
   1481 ; AVX1-LABEL: avg_v32i16_const:
   1482 ; AVX1:       # %bb.0:
   1483 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1484 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
   1485 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1486 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
   1487 ; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2
   1488 ; AVX1-NEXT:    vpavgw %xmm3, %xmm0, %xmm0
   1489 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1490 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1491 ; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2
   1492 ; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
   1493 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1494 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
   1495 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
   1496 ; AVX1-NEXT:    vzeroupper
   1497 ; AVX1-NEXT:    retq
   1498 ;
   1499 ; AVX2-LABEL: avg_v32i16_const:
   1500 ; AVX2:       # %bb.0:
   1501 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
   1502 ; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
   1503 ; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm1
   1504 ; AVX2-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0
   1505 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
   1506 ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
   1507 ; AVX2-NEXT:    vzeroupper
   1508 ; AVX2-NEXT:    retq
   1509 ;
   1510 ; AVX512F-LABEL: avg_v32i16_const:
   1511 ; AVX512F:       # %bb.0:
   1512 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
   1513 ; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
   1514 ; AVX512F-NEXT:    vpavgw (%rdi), %ymm0, %ymm1
   1515 ; AVX512F-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0
   1516 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
   1517 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
   1518 ; AVX512F-NEXT:    vzeroupper
   1519 ; AVX512F-NEXT:    retq
   1520 ;
   1521 ; AVX512BW-LABEL: avg_v32i16_const:
   1522 ; AVX512BW:       # %bb.0:
   1523 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
   1524 ; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
   1525 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
   1526 ; AVX512BW-NEXT:    vzeroupper
   1527 ; AVX512BW-NEXT:    retq
   1528   %1 = load <32 x i16>, <32 x i16>* %a
   1529   %2 = zext <32 x i16> %1 to <32 x i32>
   1530   %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   1531   %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   1532   %5 = trunc <32 x i32> %4 to <32 x i16>
   1533   store <32 x i16> %5, <32 x i16>* undef, align 4
   1534   ret void
   1535 }
   1536 
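; The *_3 variants pass and return the vectors by value and zero-extend only
; to i16 rather than i32, checking that the narrower widening is still
; recognized as a pavg on the incoming values.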
   1537 define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
   1538 ; SSE2-LABEL: avg_v16i8_3:
   1539 ; SSE2:       # %bb.0:
   1540 ; SSE2-NEXT:    pavgb %xmm1, %xmm0
   1541 ; SSE2-NEXT:    retq
   1542 ;
   1543 ; AVX-LABEL: avg_v16i8_3:
   1544 ; AVX:       # %bb.0:
   1545 ; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
   1546 ; AVX-NEXT:    retq
   1547   %za = zext <16 x i8> %a to <16 x i16>
   1548   %zb = zext <16 x i8> %b to <16 x i16>
   1549   %add = add nuw nsw <16 x i16> %za, %zb
   1550   %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1551   %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1552   %res = trunc <16 x i16> %lshr to <16 x i8>
   1553   ret <16 x i8> %res
   1554 }
   1555 
   1556 define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
   1557 ; SSE2-LABEL: avg_v32i8_3:
   1558 ; SSE2:       # %bb.0:
   1559 ; SSE2-NEXT:    pavgb %xmm2, %xmm0
   1560 ; SSE2-NEXT:    pavgb %xmm3, %xmm1
   1561 ; SSE2-NEXT:    retq
   1562 ;
   1563 ; AVX1-LABEL: avg_v32i8_3:
   1564 ; AVX1:       # %bb.0:
   1565 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1566 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1567 ; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
   1568 ; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
   1569 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1570 ; AVX1-NEXT:    retq
   1571 ;
   1572 ; AVX2-LABEL: avg_v32i8_3:
   1573 ; AVX2:       # %bb.0:
   1574 ; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
   1575 ; AVX2-NEXT:    retq
   1576 ;
   1577 ; AVX512-LABEL: avg_v32i8_3:
   1578 ; AVX512:       # %bb.0:
   1579 ; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
   1580 ; AVX512-NEXT:    retq
   1581   %za = zext <32 x i8> %a to <32 x i16>
   1582   %zb = zext <32 x i8> %b to <32 x i16>
   1583   %add = add nuw nsw <32 x i16> %za, %zb
   1584   %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1585   %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1586   %res = trunc <32 x i16> %lshr to <32 x i8>
   1587   ret <32 x i8> %res
   1588 }
   1589 
   1590 define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
   1591 ; SSE2-LABEL: avg_v64i8_3:
   1592 ; SSE2:       # %bb.0:
   1593 ; SSE2-NEXT:    pavgb %xmm4, %xmm0
   1594 ; SSE2-NEXT:    pavgb %xmm5, %xmm1
   1595 ; SSE2-NEXT:    pavgb %xmm6, %xmm2
   1596 ; SSE2-NEXT:    pavgb %xmm7, %xmm3
   1597 ; SSE2-NEXT:    retq
   1598 ;
   1599 ; AVX1-LABEL: avg_v64i8_3:
   1600 ; AVX1:       # %bb.0:
   1601 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   1602 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
   1603 ; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
   1604 ; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
   1605 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
   1606 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
   1607 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1608 ; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
   1609 ; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
   1610 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1611 ; AVX1-NEXT:    retq
   1612 ;
   1613 ; AVX2-LABEL: avg_v64i8_3:
   1614 ; AVX2:       # %bb.0:
   1615 ; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
   1616 ; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
   1617 ; AVX2-NEXT:    retq
   1618 ;
   1619 ; AVX512F-LABEL: avg_v64i8_3:
   1620 ; AVX512F:       # %bb.0:
   1621 ; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
   1622 ; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
   1623 ; AVX512F-NEXT:    retq
   1624 ;
   1625 ; AVX512BW-LABEL: avg_v64i8_3:
   1626 ; AVX512BW:       # %bb.0:
   1627 ; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
   1628 ; AVX512BW-NEXT:    retq
   1629   %za = zext <64 x i8> %a to <64 x i16>
   1630   %zb = zext <64 x i8> %b to <64 x i16>
   1631   %add = add nuw nsw <64 x i16> %za, %zb
   1632   %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1633   %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   1634   %res = trunc <64 x i16> %lshr to <64 x i8>
   1635   ret <64 x i8> %res
   1636 }
   1637 
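; avg_v512i8_3 mainly stresses type legalization: the <512 x i8> operands are
; split into 16-byte (SSE2), 32-byte (AVX1/AVX2/AVX512F) or 64-byte (AVX512BW)
; pieces, with the excess operands passed on the stack and the result returned
; indirectly through the pointer in %rdi (copied to %rax on return).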
   1638 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
   1639 ; SSE2-LABEL: avg_v512i8_3:
   1640 ; SSE2:       # %bb.0:
   1641 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1642 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1643 ; SSE2-NEXT:    movdqa %xmm8, 496(%rdi)
   1644 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1645 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1646 ; SSE2-NEXT:    movdqa %xmm8, 480(%rdi)
   1647 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1648 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1649 ; SSE2-NEXT:    movdqa %xmm8, 464(%rdi)
   1650 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1651 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1652 ; SSE2-NEXT:    movdqa %xmm8, 448(%rdi)
   1653 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1654 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1655 ; SSE2-NEXT:    movdqa %xmm8, 432(%rdi)
   1656 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1657 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1658 ; SSE2-NEXT:    movdqa %xmm8, 416(%rdi)
   1659 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1660 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1661 ; SSE2-NEXT:    movdqa %xmm8, 400(%rdi)
   1662 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1663 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1664 ; SSE2-NEXT:    movdqa %xmm8, 384(%rdi)
   1665 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1666 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1667 ; SSE2-NEXT:    movdqa %xmm8, 368(%rdi)
   1668 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1669 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1670 ; SSE2-NEXT:    movdqa %xmm8, 352(%rdi)
   1671 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1672 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1673 ; SSE2-NEXT:    movdqa %xmm8, 336(%rdi)
   1674 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1675 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1676 ; SSE2-NEXT:    movdqa %xmm8, 320(%rdi)
   1677 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1678 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1679 ; SSE2-NEXT:    movdqa %xmm8, 304(%rdi)
   1680 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1681 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1682 ; SSE2-NEXT:    movdqa %xmm8, 288(%rdi)
   1683 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1684 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1685 ; SSE2-NEXT:    movdqa %xmm8, 272(%rdi)
   1686 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1687 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1688 ; SSE2-NEXT:    movdqa %xmm8, 256(%rdi)
   1689 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1690 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1691 ; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
   1692 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1693 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1694 ; SSE2-NEXT:    movdqa %xmm8, 224(%rdi)
   1695 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1696 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1697 ; SSE2-NEXT:    movdqa %xmm8, 208(%rdi)
   1698 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1699 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1700 ; SSE2-NEXT:    movdqa %xmm8, 192(%rdi)
   1701 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1702 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1703 ; SSE2-NEXT:    movdqa %xmm8, 176(%rdi)
   1704 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1705 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1706 ; SSE2-NEXT:    movdqa %xmm8, 160(%rdi)
   1707 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1708 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1709 ; SSE2-NEXT:    movdqa %xmm8, 144(%rdi)
   1710 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   1711 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
   1712 ; SSE2-NEXT:    movdqa %xmm8, 128(%rdi)
   1713 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm7
   1714 ; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
   1715 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm6
   1716 ; SSE2-NEXT:    movdqa %xmm6, 96(%rdi)
   1717 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm5
   1718 ; SSE2-NEXT:    movdqa %xmm5, 80(%rdi)
   1719 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm4
   1720 ; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
   1721 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm3
   1722 ; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
   1723 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm2
   1724 ; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
   1725 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm1
   1726 ; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
   1727 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm0
   1728 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
   1729 ; SSE2-NEXT:    movq %rdi, %rax
   1730 ; SSE2-NEXT:    retq
   1731 ;
   1732 ; AVX1-LABEL: avg_v512i8_3:
   1733 ; AVX1:       # %bb.0:
   1734 ; AVX1-NEXT:    pushq %rbp
   1735 ; AVX1-NEXT:    movq %rsp, %rbp
   1736 ; AVX1-NEXT:    andq $-32, %rsp
   1737 ; AVX1-NEXT:    subq $128, %rsp
   1738 ; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm8
   1739 ; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm9
   1740 ; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm10
   1741 ; AVX1-NEXT:    vmovdqa 48(%rbp), %ymm11
   1742 ; AVX1-NEXT:    vmovdqa 16(%rbp), %ymm12
   1743 ; AVX1-NEXT:    vmovdqa 272(%rbp), %ymm13
   1744 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm14
   1745 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm15
   1746 ; AVX1-NEXT:    vpavgb %xmm14, %xmm15, %xmm14
   1747 ; AVX1-NEXT:    vmovdqa 304(%rbp), %ymm15
   1748 ; AVX1-NEXT:    vpavgb %xmm13, %xmm0, %xmm0
   1749 ; AVX1-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
   1750 ; AVX1-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   1751 ; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
   1752 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1753 ; AVX1-NEXT:    vpavgb %xmm14, %xmm0, %xmm0
   1754 ; AVX1-NEXT:    vmovdqa 336(%rbp), %ymm14
   1755 ; AVX1-NEXT:    vpavgb %xmm15, %xmm1, %xmm1
   1756 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   1757 ; AVX1-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
   1758 ; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm0
   1759 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1760 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1761 ; AVX1-NEXT:    vmovdqa 368(%rbp), %ymm1
   1762 ; AVX1-NEXT:    vpavgb %xmm14, %xmm2, %xmm2
   1763 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
   1764 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp) # 32-byte Spill
   1765 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1766 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
   1767 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
   1768 ; AVX1-NEXT:    vmovdqa 400(%rbp), %ymm2
   1769 ; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
   1770 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm3
   1771 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1772 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm1
   1773 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1774 ; AVX1-NEXT:    vmovdqa 432(%rbp), %ymm1
   1775 ; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
   1776 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm4
   1777 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1778 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm2
   1779 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
   1780 ; AVX1-NEXT:    vmovdqa 464(%rbp), %ymm2
   1781 ; AVX1-NEXT:    vpavgb %xmm1, %xmm5, %xmm1
   1782 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm5
   1783 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1784 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
   1785 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1786 ; AVX1-NEXT:    vmovdqa 496(%rbp), %ymm1
   1787 ; AVX1-NEXT:    vpavgb %xmm2, %xmm6, %xmm2
   1788 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm6
   1789 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1790 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm2
   1791 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
   1792 ; AVX1-NEXT:    vmovdqa 528(%rbp), %ymm2
   1793 ; AVX1-NEXT:    vpavgb %xmm1, %xmm7, %xmm1
   1794 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm7
   1795 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1796 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm1
   1797 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1798 ; AVX1-NEXT:    vmovdqa 560(%rbp), %ymm1
   1799 ; AVX1-NEXT:    vpavgb %xmm2, %xmm12, %xmm2
   1800 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm12
   1801 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1802 ; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm2
   1803 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
   1804 ; AVX1-NEXT:    vmovdqa 592(%rbp), %ymm2
   1805 ; AVX1-NEXT:    vpavgb %xmm1, %xmm11, %xmm1
   1806 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm11
   1807 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1808 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm1
   1809 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1810 ; AVX1-NEXT:    vmovdqa 624(%rbp), %ymm1
   1811 ; AVX1-NEXT:    vpavgb %xmm2, %xmm10, %xmm2
   1812 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm10
   1813 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
   1814 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm2
   1815 ; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
   1816 ; AVX1-NEXT:    vmovdqa 656(%rbp), %ymm2
   1817 ; AVX1-NEXT:    vpavgb %xmm1, %xmm9, %xmm1
   1818 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
   1819 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
   1820 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
   1821 ; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
   1822 ; AVX1-NEXT:    vmovdqa 176(%rbp), %ymm1
   1823 ; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
   1824 ; AVX1-NEXT:    vmovdqa 688(%rbp), %ymm8
   1825 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
   1826 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
   1827 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm13
   1828 ; AVX1-NEXT:    vpavgb %xmm2, %xmm13, %xmm2
   1829 ; AVX1-NEXT:    vpavgb %xmm8, %xmm1, %xmm1
   1830 ; AVX1-NEXT:    vmovdqa 208(%rbp), %ymm8
   1831 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm13
   1832 ; AVX1-NEXT:    vmovdqa 720(%rbp), %ymm2
   1833 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1834 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm15
   1835 ; AVX1-NEXT:    vpavgb %xmm1, %xmm15, %xmm1
   1836 ; AVX1-NEXT:    vpavgb %xmm2, %xmm8, %xmm2
   1837 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
   1838 ; AVX1-NEXT:    vmovdqa 240(%rbp), %ymm15
   1839 ; AVX1-NEXT:    vmovdqa 752(%rbp), %ymm8
   1840 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
   1841 ; AVX1-NEXT:    vextractf128 $1, %ymm15, %xmm14
   1842 ; AVX1-NEXT:    vpavgb %xmm2, %xmm14, %xmm2
   1843 ; AVX1-NEXT:    vpavgb %xmm8, %xmm15, %xmm8
   1844 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm8, %ymm2
   1845 ; AVX1-NEXT:    vmovaps %ymm2, 480(%rdi)
   1846 ; AVX1-NEXT:    vmovaps %ymm1, 448(%rdi)
   1847 ; AVX1-NEXT:    vmovaps %ymm13, 416(%rdi)
   1848 ; AVX1-NEXT:    vmovaps %ymm0, 384(%rdi)
   1849 ; AVX1-NEXT:    vmovaps %ymm9, 352(%rdi)
   1850 ; AVX1-NEXT:    vmovaps %ymm10, 320(%rdi)
   1851 ; AVX1-NEXT:    vmovaps %ymm11, 288(%rdi)
   1852 ; AVX1-NEXT:    vmovaps %ymm12, 256(%rdi)
   1853 ; AVX1-NEXT:    vmovaps %ymm7, 224(%rdi)
   1854 ; AVX1-NEXT:    vmovaps %ymm6, 192(%rdi)
   1855 ; AVX1-NEXT:    vmovaps %ymm5, 160(%rdi)
   1856 ; AVX1-NEXT:    vmovaps %ymm4, 128(%rdi)
   1857 ; AVX1-NEXT:    vmovaps %ymm3, 96(%rdi)
   1858 ; AVX1-NEXT:    vmovaps (%rsp), %ymm0 # 32-byte Reload
   1859 ; AVX1-NEXT:    vmovaps %ymm0, 64(%rdi)
   1860 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   1861 ; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
   1862 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
   1863 ; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
   1864 ; AVX1-NEXT:    movq %rdi, %rax
   1865 ; AVX1-NEXT:    movq %rbp, %rsp
   1866 ; AVX1-NEXT:    popq %rbp
   1867 ; AVX1-NEXT:    vzeroupper
   1868 ; AVX1-NEXT:    retq
   1869 ;
   1870 ; AVX2-LABEL: avg_v512i8_3:
   1871 ; AVX2:       # %bb.0:
   1872 ; AVX2-NEXT:    pushq %rbp
   1873 ; AVX2-NEXT:    movq %rsp, %rbp
   1874 ; AVX2-NEXT:    andq $-32, %rsp
   1875 ; AVX2-NEXT:    subq $32, %rsp
   1876 ; AVX2-NEXT:    vmovdqa 240(%rbp), %ymm8
   1877 ; AVX2-NEXT:    vmovdqa 208(%rbp), %ymm9
   1878 ; AVX2-NEXT:    vmovdqa 176(%rbp), %ymm10
   1879 ; AVX2-NEXT:    vmovdqa 144(%rbp), %ymm11
   1880 ; AVX2-NEXT:    vmovdqa 112(%rbp), %ymm12
   1881 ; AVX2-NEXT:    vmovdqa 80(%rbp), %ymm13
   1882 ; AVX2-NEXT:    vmovdqa 48(%rbp), %ymm14
   1883 ; AVX2-NEXT:    vmovdqa 16(%rbp), %ymm15
   1884 ; AVX2-NEXT:    vpavgb 272(%rbp), %ymm0, %ymm0
   1885 ; AVX2-NEXT:    vpavgb 304(%rbp), %ymm1, %ymm1
   1886 ; AVX2-NEXT:    vpavgb 336(%rbp), %ymm2, %ymm2
   1887 ; AVX2-NEXT:    vpavgb 368(%rbp), %ymm3, %ymm3
   1888 ; AVX2-NEXT:    vpavgb 400(%rbp), %ymm4, %ymm4
   1889 ; AVX2-NEXT:    vpavgb 432(%rbp), %ymm5, %ymm5
   1890 ; AVX2-NEXT:    vpavgb 464(%rbp), %ymm6, %ymm6
   1891 ; AVX2-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
   1892 ; AVX2-NEXT:    vpavgb 528(%rbp), %ymm15, %ymm15
   1893 ; AVX2-NEXT:    vpavgb 560(%rbp), %ymm14, %ymm14
   1894 ; AVX2-NEXT:    vpavgb 592(%rbp), %ymm13, %ymm13
   1895 ; AVX2-NEXT:    vpavgb 624(%rbp), %ymm12, %ymm12
   1896 ; AVX2-NEXT:    vpavgb 656(%rbp), %ymm11, %ymm11
   1897 ; AVX2-NEXT:    vpavgb 688(%rbp), %ymm10, %ymm10
   1898 ; AVX2-NEXT:    vpavgb 720(%rbp), %ymm9, %ymm9
   1899 ; AVX2-NEXT:    vpavgb 752(%rbp), %ymm8, %ymm8
   1900 ; AVX2-NEXT:    vmovdqa %ymm8, 480(%rdi)
   1901 ; AVX2-NEXT:    vmovdqa %ymm9, 448(%rdi)
   1902 ; AVX2-NEXT:    vmovdqa %ymm10, 416(%rdi)
   1903 ; AVX2-NEXT:    vmovdqa %ymm11, 384(%rdi)
   1904 ; AVX2-NEXT:    vmovdqa %ymm12, 352(%rdi)
   1905 ; AVX2-NEXT:    vmovdqa %ymm13, 320(%rdi)
   1906 ; AVX2-NEXT:    vmovdqa %ymm14, 288(%rdi)
   1907 ; AVX2-NEXT:    vmovdqa %ymm15, 256(%rdi)
   1908 ; AVX2-NEXT:    vmovdqa %ymm7, 224(%rdi)
   1909 ; AVX2-NEXT:    vmovdqa %ymm6, 192(%rdi)
   1910 ; AVX2-NEXT:    vmovdqa %ymm5, 160(%rdi)
   1911 ; AVX2-NEXT:    vmovdqa %ymm4, 128(%rdi)
   1912 ; AVX2-NEXT:    vmovdqa %ymm3, 96(%rdi)
   1913 ; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
   1914 ; AVX2-NEXT:    vmovdqa %ymm1, 32(%rdi)
   1915 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
   1916 ; AVX2-NEXT:    movq %rdi, %rax
   1917 ; AVX2-NEXT:    movq %rbp, %rsp
   1918 ; AVX2-NEXT:    popq %rbp
   1919 ; AVX2-NEXT:    vzeroupper
   1920 ; AVX2-NEXT:    retq
   1921 ;
   1922 ; AVX512F-LABEL: avg_v512i8_3:
   1923 ; AVX512F:       # %bb.0:
   1924 ; AVX512F-NEXT:    pushq %rbp
   1925 ; AVX512F-NEXT:    movq %rsp, %rbp
   1926 ; AVX512F-NEXT:    andq $-32, %rsp
   1927 ; AVX512F-NEXT:    subq $32, %rsp
   1928 ; AVX512F-NEXT:    vmovdqa 240(%rbp), %ymm8
   1929 ; AVX512F-NEXT:    vmovdqa 208(%rbp), %ymm9
   1930 ; AVX512F-NEXT:    vmovdqa 176(%rbp), %ymm10
   1931 ; AVX512F-NEXT:    vmovdqa 144(%rbp), %ymm11
   1932 ; AVX512F-NEXT:    vmovdqa 112(%rbp), %ymm12
   1933 ; AVX512F-NEXT:    vmovdqa 80(%rbp), %ymm13
   1934 ; AVX512F-NEXT:    vmovdqa 48(%rbp), %ymm14
   1935 ; AVX512F-NEXT:    vmovdqa 16(%rbp), %ymm15
   1936 ; AVX512F-NEXT:    vpavgb 272(%rbp), %ymm0, %ymm0
   1937 ; AVX512F-NEXT:    vpavgb 304(%rbp), %ymm1, %ymm1
   1938 ; AVX512F-NEXT:    vpavgb 336(%rbp), %ymm2, %ymm2
   1939 ; AVX512F-NEXT:    vpavgb 368(%rbp), %ymm3, %ymm3
   1940 ; AVX512F-NEXT:    vpavgb 400(%rbp), %ymm4, %ymm4
   1941 ; AVX512F-NEXT:    vpavgb 432(%rbp), %ymm5, %ymm5
   1942 ; AVX512F-NEXT:    vpavgb 464(%rbp), %ymm6, %ymm6
   1943 ; AVX512F-NEXT:    vpavgb 496(%rbp), %ymm7, %ymm7
   1944 ; AVX512F-NEXT:    vpavgb 528(%rbp), %ymm15, %ymm15
   1945 ; AVX512F-NEXT:    vpavgb 560(%rbp), %ymm14, %ymm14
   1946 ; AVX512F-NEXT:    vpavgb 592(%rbp), %ymm13, %ymm13
   1947 ; AVX512F-NEXT:    vpavgb 624(%rbp), %ymm12, %ymm12
   1948 ; AVX512F-NEXT:    vpavgb 656(%rbp), %ymm11, %ymm11
   1949 ; AVX512F-NEXT:    vpavgb 688(%rbp), %ymm10, %ymm10
   1950 ; AVX512F-NEXT:    vpavgb 720(%rbp), %ymm9, %ymm9
   1951 ; AVX512F-NEXT:    vpavgb 752(%rbp), %ymm8, %ymm8
   1952 ; AVX512F-NEXT:    vmovdqa %ymm8, 480(%rdi)
   1953 ; AVX512F-NEXT:    vmovdqa %ymm9, 448(%rdi)
   1954 ; AVX512F-NEXT:    vmovdqa %ymm10, 416(%rdi)
   1955 ; AVX512F-NEXT:    vmovdqa %ymm11, 384(%rdi)
   1956 ; AVX512F-NEXT:    vmovdqa %ymm12, 352(%rdi)
   1957 ; AVX512F-NEXT:    vmovdqa %ymm13, 320(%rdi)
   1958 ; AVX512F-NEXT:    vmovdqa %ymm14, 288(%rdi)
   1959 ; AVX512F-NEXT:    vmovdqa %ymm15, 256(%rdi)
   1960 ; AVX512F-NEXT:    vmovdqa %ymm7, 224(%rdi)
   1961 ; AVX512F-NEXT:    vmovdqa %ymm6, 192(%rdi)
   1962 ; AVX512F-NEXT:    vmovdqa %ymm5, 160(%rdi)
   1963 ; AVX512F-NEXT:    vmovdqa %ymm4, 128(%rdi)
   1964 ; AVX512F-NEXT:    vmovdqa %ymm3, 96(%rdi)
   1965 ; AVX512F-NEXT:    vmovdqa %ymm2, 64(%rdi)
   1966 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdi)
   1967 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdi)
   1968 ; AVX512F-NEXT:    movq %rdi, %rax
   1969 ; AVX512F-NEXT:    movq %rbp, %rsp
   1970 ; AVX512F-NEXT:    popq %rbp
   1971 ; AVX512F-NEXT:    vzeroupper
   1972 ; AVX512F-NEXT:    retq
   1973 ;
   1974 ; AVX512BW-LABEL: avg_v512i8_3:
   1975 ; AVX512BW:       # %bb.0:
   1976 ; AVX512BW-NEXT:    pushq %rbp
   1977 ; AVX512BW-NEXT:    movq %rsp, %rbp
   1978 ; AVX512BW-NEXT:    andq $-64, %rsp
   1979 ; AVX512BW-NEXT:    subq $64, %rsp
   1980 ; AVX512BW-NEXT:    vpavgb 16(%rbp), %zmm0, %zmm0
   1981 ; AVX512BW-NEXT:    vpavgb 80(%rbp), %zmm1, %zmm1
   1982 ; AVX512BW-NEXT:    vpavgb 144(%rbp), %zmm2, %zmm2
   1983 ; AVX512BW-NEXT:    vpavgb 208(%rbp), %zmm3, %zmm3
   1984 ; AVX512BW-NEXT:    vpavgb 272(%rbp), %zmm4, %zmm4
   1985 ; AVX512BW-NEXT:    vpavgb 336(%rbp), %zmm5, %zmm5
   1986 ; AVX512BW-NEXT:    vpavgb 400(%rbp), %zmm6, %zmm6
   1987 ; AVX512BW-NEXT:    vpavgb 464(%rbp), %zmm7, %zmm7
   1988 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdi)
   1989 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdi)
   1990 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
   1991 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
   1992 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
   1993 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
   1994 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
   1995 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
   1996 ; AVX512BW-NEXT:    movq %rdi, %rax
   1997 ; AVX512BW-NEXT:    movq %rbp, %rsp
   1998 ; AVX512BW-NEXT:    popq %rbp
   1999 ; AVX512BW-NEXT:    vzeroupper
   2000 ; AVX512BW-NEXT:    retq
   2001   %za = zext <512 x i8> %a to <512 x i16>
   2002   %zb = zext <512 x i8> %b to <512 x i16>
   2003   %add = add nuw nsw <512 x i16> %za, %zb
   2004   %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   2005   %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   2006   %res = trunc <512 x i16> %lshr to <512 x i8>
   2007   ret <512 x i8> %res
   2008 }
   2009 
   2010 ; This is not an avg, but it's structurally similar and previously caused a crash
   2011 ; because the constants can't be read with APInt::getZExtValue.
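; For reference, a minimal C++ sketch (hypothetical; it uses the generic
; llvm::APInt API, not the exact combine that crashed) of why the i128 -1
; constants below are a problem: getZExtValue() asserts once a value needs more
; than 64 bits, so callers have to check isIntN(64) first or fall back to
; getLimitedValue().
;
;   #include "llvm/ADT/APInt.h"
;   using llvm::APInt;
;
;   // i128 -1 sign-extends to all 128 bits set, so it does not fit in 64 bits
;   // and getZExtValue() would assert on it.
;   APInt WideC(/*numBits=*/128, /*val=*/-1ULL, /*isSigned=*/true);
;   uint64_t Lo = WideC.isIntN(64) ? WideC.getZExtValue()     // safe only when it fits
;                                  : WideC.getLimitedValue(); // clamps to UINT64_MAX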
   2012 define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
   2013 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
   2014 ; SSE2:       # %bb.0:
   2015 ; SSE2-NEXT:    pushq %rbp
   2016 ; SSE2-NEXT:    pushq %r15
   2017 ; SSE2-NEXT:    pushq %r14
   2018 ; SSE2-NEXT:    pushq %r13
   2019 ; SSE2-NEXT:    pushq %r12
   2020 ; SSE2-NEXT:    pushq %rbx
   2021 ; SSE2-NEXT:    movaps (%rdi), %xmm1
   2022 ; SSE2-NEXT:    movaps (%rsi), %xmm0
   2023 ; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
   2024 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
   2025 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2026 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
   2027 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2028 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
   2029 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2030 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
   2031 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2032 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
   2033 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
   2034 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
   2035 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
   2036 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
   2037 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
   2038 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
   2039 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
   2040 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
   2041 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
   2042 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
   2043 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
   2044 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
   2045 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
   2046 ; SSE2-NEXT:    leaq -1(%rax,%r9), %rax
   2047 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2048 ; SSE2-NEXT:    leaq -1(%rbp,%rbx), %rbp
   2049 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2050 ; SSE2-NEXT:    leaq -1(%rdx,%rbx), %rdx
   2051 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2052 ; SSE2-NEXT:    leaq -1(%rcx,%rbx), %rcx
   2053 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2054 ; SSE2-NEXT:    leaq -1(%rsi,%rbx), %rsi
   2055 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2056 ; SSE2-NEXT:    leaq -1(%rdi,%rbx), %r8
   2057 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2058 ; SSE2-NEXT:    leaq -1(%r11,%rbx), %r9
   2059 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2060 ; SSE2-NEXT:    leaq -1(%r10,%rbx), %r11
   2061 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2062 ; SSE2-NEXT:    leaq -1(%r13,%rbx), %r13
   2063 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2064 ; SSE2-NEXT:    leaq -1(%r12,%rbx), %r12
   2065 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2066 ; SSE2-NEXT:    leaq -1(%r15,%rbx), %r15
   2067 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2068 ; SSE2-NEXT:    leaq -1(%r14,%rbx), %r14
   2069 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2070 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
   2071 ; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rdi
   2072 ; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2073 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2074 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
   2075 ; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rbx
   2076 ; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2077 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2078 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
   2079 ; SSE2-NEXT:    leaq -1(%rdi,%rbx), %rbx
   2080 ; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2081 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
   2082 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
   2083 ; SSE2-NEXT:    leaq -1(%r10,%rbx), %rbx
   2084 ; SSE2-NEXT:    shrq %rax
   2085 ; SSE2-NEXT:    movd %eax, %xmm8
   2086 ; SSE2-NEXT:    shrq %rbp
   2087 ; SSE2-NEXT:    movd %ebp, %xmm15
   2088 ; SSE2-NEXT:    shrq %rdx
   2089 ; SSE2-NEXT:    movd %edx, %xmm9
   2090 ; SSE2-NEXT:    shrq %rcx
   2091 ; SSE2-NEXT:    movd %ecx, %xmm2
   2092 ; SSE2-NEXT:    shrq %rsi
   2093 ; SSE2-NEXT:    movd %esi, %xmm10
   2094 ; SSE2-NEXT:    shrq %r8
   2095 ; SSE2-NEXT:    movd %r8d, %xmm6
   2096 ; SSE2-NEXT:    shrq %r9
   2097 ; SSE2-NEXT:    movd %r9d, %xmm11
   2098 ; SSE2-NEXT:    shrq %r11
   2099 ; SSE2-NEXT:    movd %r11d, %xmm5
   2100 ; SSE2-NEXT:    shrq %r13
   2101 ; SSE2-NEXT:    movd %r13d, %xmm12
   2102 ; SSE2-NEXT:    shrq %r12
   2103 ; SSE2-NEXT:    movd %r12d, %xmm3
   2104 ; SSE2-NEXT:    shrq %r15
   2105 ; SSE2-NEXT:    movd %r15d, %xmm13
   2106 ; SSE2-NEXT:    shrq %r14
   2107 ; SSE2-NEXT:    movd %r14d, %xmm7
   2108 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2109 ; SSE2-NEXT:    shrq %rax
   2110 ; SSE2-NEXT:    movd %eax, %xmm14
   2111 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2112 ; SSE2-NEXT:    shrq %rax
   2113 ; SSE2-NEXT:    movd %eax, %xmm4
   2114 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2115 ; SSE2-NEXT:    shrq %rax
   2116 ; SSE2-NEXT:    movd %eax, %xmm0
   2117 ; SSE2-NEXT:    shrq %rbx
   2118 ; SSE2-NEXT:    movd %ebx, %xmm1
   2119 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
   2120 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
   2121 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
   2122 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
   2123 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
   2124 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
   2125 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
   2126 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
   2127 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
   2128 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
   2129 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
   2130 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2131 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
   2132 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
   2133 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
   2134 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
   2135 ; SSE2-NEXT:    popq %rbx
   2136 ; SSE2-NEXT:    popq %r12
   2137 ; SSE2-NEXT:    popq %r13
   2138 ; SSE2-NEXT:    popq %r14
   2139 ; SSE2-NEXT:    popq %r15
   2140 ; SSE2-NEXT:    popq %rbp
   2141 ; SSE2-NEXT:    retq
   2142 ;
   2143 ; AVX1-LABEL: not_avg_v16i8_wide_constants:
   2144 ; AVX1:       # %bb.0:
   2145 ; AVX1-NEXT:    pushq %rbp
   2146 ; AVX1-NEXT:    pushq %r15
   2147 ; AVX1-NEXT:    pushq %r14
   2148 ; AVX1-NEXT:    pushq %r13
   2149 ; AVX1-NEXT:    pushq %r12
   2150 ; AVX1-NEXT:    pushq %rbx
   2151 ; AVX1-NEXT:    subq $24, %rsp
   2152 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   2153 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   2154 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   2155 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
   2156 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   2157 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
   2158 ; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
   2159 ; AVX1-NEXT:    vmovq %xmm5, %rbp
   2160 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   2161 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
   2162 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rsi
   2163 ; AVX1-NEXT:    vmovq %xmm4, %rcx
   2164 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2165 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   2166 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
   2167 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r8
   2168 ; AVX1-NEXT:    vmovq %xmm4, %r11
   2169 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2170 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
   2171 ; AVX1-NEXT:    vpextrq $1, %xmm3, %r13
   2172 ; AVX1-NEXT:    vmovq %xmm3, %r12
   2173 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2174 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
   2175 ; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
   2176 ; AVX1-NEXT:    vmovq %xmm4, %rdi
   2177 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2178 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
   2179 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2180 ; AVX1-NEXT:    vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2181 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2182 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2183 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
   2184 ; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2185 ; AVX1-NEXT:    vmovq %xmm3, %r10
   2186 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2187 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
   2188 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2189 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
   2190 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rdx
   2191 ; AVX1-NEXT:    addq %rbx, %rdx
   2192 ; AVX1-NEXT:    vmovq %xmm4, %r9
   2193 ; AVX1-NEXT:    addq %rbp, %r9
   2194 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2195 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
   2196 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
   2197 ; AVX1-NEXT:    addq %rsi, %rax
   2198 ; AVX1-NEXT:    movq %rax, %r14
   2199 ; AVX1-NEXT:    vmovq %xmm3, %rbp
   2200 ; AVX1-NEXT:    addq %rcx, %rbp
   2201 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2202 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2203 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
   2204 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
   2205 ; AVX1-NEXT:    addq %r8, %rsi
   2206 ; AVX1-NEXT:    vmovq %xmm3, %rax
   2207 ; AVX1-NEXT:    addq %r11, %rax
   2208 ; AVX1-NEXT:    movq %rax, %r11
   2209 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2210 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
   2211 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
   2212 ; AVX1-NEXT:    addq %r13, %rax
   2213 ; AVX1-NEXT:    movq %rax, %rcx
   2214 ; AVX1-NEXT:    vmovq %xmm2, %rax
   2215 ; AVX1-NEXT:    addq %r12, %rax
   2216 ; AVX1-NEXT:    movq %rax, %r8
   2217 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2218 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
   2219 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
   2220 ; AVX1-NEXT:    addq %r15, %rax
   2221 ; AVX1-NEXT:    movq %rax, %rbx
   2222 ; AVX1-NEXT:    vmovq %xmm3, %rax
   2223 ; AVX1-NEXT:    addq %rdi, %rax
   2224 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2225 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2226 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
   2227 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
   2228 ; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2229 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2230 ; AVX1-NEXT:    vmovq %xmm2, %rax
   2231 ; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2232 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2233 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2234 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2235 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
   2236 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
   2237 ; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2238 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2239 ; AVX1-NEXT:    vmovq %xmm2, %r12
   2240 ; AVX1-NEXT:    addq %r10, %r12
   2241 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
   2242 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2243 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
   2244 ; AVX1-NEXT:    vpextrq $1, %xmm0, %r10
   2245 ; AVX1-NEXT:    addq %rax, %r10
   2246 ; AVX1-NEXT:    vmovq %xmm1, %rax
   2247 ; AVX1-NEXT:    vmovq %xmm0, %rdi
   2248 ; AVX1-NEXT:    addq %rax, %rdi
   2249 ; AVX1-NEXT:    addq $-1, %rdx
   2250 ; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2251 ; AVX1-NEXT:    movl $0, %eax
   2252 ; AVX1-NEXT:    adcq $-1, %rax
   2253 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2254 ; AVX1-NEXT:    addq $-1, %r9
   2255 ; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2256 ; AVX1-NEXT:    movl $0, %eax
   2257 ; AVX1-NEXT:    adcq $-1, %rax
   2258 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2259 ; AVX1-NEXT:    addq $-1, %r14
   2260 ; AVX1-NEXT:    movq %r14, (%rsp) # 8-byte Spill
   2261 ; AVX1-NEXT:    movl $0, %eax
   2262 ; AVX1-NEXT:    adcq $-1, %rax
   2263 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2264 ; AVX1-NEXT:    addq $-1, %rbp
   2265 ; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2266 ; AVX1-NEXT:    movl $0, %eax
   2267 ; AVX1-NEXT:    adcq $-1, %rax
   2268 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2269 ; AVX1-NEXT:    addq $-1, %rsi
   2270 ; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2271 ; AVX1-NEXT:    movl $0, %eax
   2272 ; AVX1-NEXT:    adcq $-1, %rax
   2273 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2274 ; AVX1-NEXT:    addq $-1, %r11
   2275 ; AVX1-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2276 ; AVX1-NEXT:    movl $0, %eax
   2277 ; AVX1-NEXT:    adcq $-1, %rax
   2278 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2279 ; AVX1-NEXT:    addq $-1, %rcx
   2280 ; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2281 ; AVX1-NEXT:    movl $0, %ebp
   2282 ; AVX1-NEXT:    adcq $-1, %rbp
   2283 ; AVX1-NEXT:    addq $-1, %r8
   2284 ; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2285 ; AVX1-NEXT:    movl $0, %r15d
   2286 ; AVX1-NEXT:    adcq $-1, %r15
   2287 ; AVX1-NEXT:    addq $-1, %rbx
   2288 ; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2289 ; AVX1-NEXT:    movl $0, %eax
   2290 ; AVX1-NEXT:    adcq $-1, %rax
   2291 ; AVX1-NEXT:    movq %rax, %rsi
   2292 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2293 ; AVX1-NEXT:    movl $0, %r13d
   2294 ; AVX1-NEXT:    adcq $-1, %r13
   2295 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2296 ; AVX1-NEXT:    movl $0, %r14d
   2297 ; AVX1-NEXT:    adcq $-1, %r14
   2298 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2299 ; AVX1-NEXT:    addq $-1, %rdx
   2300 ; AVX1-NEXT:    movl $0, %r11d
   2301 ; AVX1-NEXT:    adcq $-1, %r11
   2302 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2303 ; AVX1-NEXT:    addq $-1, %rax
   2304 ; AVX1-NEXT:    movl $0, %ebx
   2305 ; AVX1-NEXT:    adcq $-1, %rbx
   2306 ; AVX1-NEXT:    addq $-1, %r12
   2307 ; AVX1-NEXT:    movl $0, %r9d
   2308 ; AVX1-NEXT:    adcq $-1, %r9
   2309 ; AVX1-NEXT:    addq $-1, %r10
   2310 ; AVX1-NEXT:    movl $0, %r8d
   2311 ; AVX1-NEXT:    adcq $-1, %r8
   2312 ; AVX1-NEXT:    addq $-1, %rdi
   2313 ; AVX1-NEXT:    movl $0, %ecx
   2314 ; AVX1-NEXT:    adcq $-1, %rcx
   2315 ; AVX1-NEXT:    shldq $63, %rdi, %rcx
   2316 ; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2317 ; AVX1-NEXT:    shldq $63, %r10, %r8
   2318 ; AVX1-NEXT:    shldq $63, %r12, %r9
   2319 ; AVX1-NEXT:    shldq $63, %rax, %rbx
   2320 ; AVX1-NEXT:    shldq $63, %rdx, %r11
   2321 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2322 ; AVX1-NEXT:    shldq $63, %rdx, %r14
   2323 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2324 ; AVX1-NEXT:    shldq $63, %rdx, %r13
   2325 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2326 ; AVX1-NEXT:    shldq $63, %rax, %rsi
   2327 ; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2328 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2329 ; AVX1-NEXT:    shldq $63, %rax, %r15
   2330 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2331 ; AVX1-NEXT:    shldq $63, %rax, %rbp
   2332 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
   2333 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2334 ; AVX1-NEXT:    shldq $63, %rax, %rsi
   2335 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2336 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2337 ; AVX1-NEXT:    shldq $63, %rax, %rcx
   2338 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
   2339 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2340 ; AVX1-NEXT:    shldq $63, %rax, %rdi
   2341 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
   2342 ; AVX1-NEXT:    movq (%rsp), %rax # 8-byte Reload
   2343 ; AVX1-NEXT:    shldq $63, %rax, %r12
   2344 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
   2345 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2346 ; AVX1-NEXT:    shldq $63, %rax, %r10
   2347 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2348 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2349 ; AVX1-NEXT:    shldq $63, %rdx, %rax
   2350 ; AVX1-NEXT:    vmovq %rax, %xmm8
   2351 ; AVX1-NEXT:    vmovq %r10, %xmm0
   2352 ; AVX1-NEXT:    vmovq %r12, %xmm1
   2353 ; AVX1-NEXT:    vmovq %rdi, %xmm11
   2354 ; AVX1-NEXT:    vmovq %rcx, %xmm2
   2355 ; AVX1-NEXT:    vmovq %rsi, %xmm13
   2356 ; AVX1-NEXT:    vmovq %rbp, %xmm14
   2357 ; AVX1-NEXT:    vmovq %r15, %xmm15
   2358 ; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
   2359 ; AVX1-NEXT:    # xmm9 = mem[0],zero
   2360 ; AVX1-NEXT:    vmovq %r13, %xmm10
   2361 ; AVX1-NEXT:    vmovq %r14, %xmm12
   2362 ; AVX1-NEXT:    vmovq %r11, %xmm3
   2363 ; AVX1-NEXT:    vmovq %rbx, %xmm4
   2364 ; AVX1-NEXT:    vmovq %r9, %xmm5
   2365 ; AVX1-NEXT:    vmovq %r8, %xmm6
   2366 ; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
   2367 ; AVX1-NEXT:    # xmm7 = mem[0],zero
   2368 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
   2369 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
   2370 ; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
   2371 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
   2372 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
   2373 ; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2]
   2374 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2375 ; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm0
   2376 ; AVX1-NEXT:    vpshufb %xmm1, %xmm11, %xmm2
   2377 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2378 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0]
   2379 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
   2380 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
   2381 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
   2382 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2383 ; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2384 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
   2385 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
   2386 ; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
   2387 ; AVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
   2388 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
   2389 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2390 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2391 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
   2392 ; AVX1-NEXT:    addq $24, %rsp
   2393 ; AVX1-NEXT:    popq %rbx
   2394 ; AVX1-NEXT:    popq %r12
   2395 ; AVX1-NEXT:    popq %r13
   2396 ; AVX1-NEXT:    popq %r14
   2397 ; AVX1-NEXT:    popq %r15
   2398 ; AVX1-NEXT:    popq %rbp
   2399 ; AVX1-NEXT:    retq
   2400 ;
   2401 ; AVX2-LABEL: not_avg_v16i8_wide_constants:
   2402 ; AVX2:       # %bb.0:
   2403 ; AVX2-NEXT:    pushq %rbp
   2404 ; AVX2-NEXT:    pushq %r15
   2405 ; AVX2-NEXT:    pushq %r14
   2406 ; AVX2-NEXT:    pushq %r13
   2407 ; AVX2-NEXT:    pushq %r12
   2408 ; AVX2-NEXT:    pushq %rbx
   2409 ; AVX2-NEXT:    subq $16, %rsp
   2410 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   2411 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   2412 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2413 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2414 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2415 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
   2416 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rbx
   2417 ; AVX2-NEXT:    vmovq %xmm4, %rbp
   2418 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rdi
   2419 ; AVX2-NEXT:    vmovq %xmm3, %rcx
   2420 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
   2421 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2422 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2423 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rdx
   2424 ; AVX2-NEXT:    vmovq %xmm3, %r9
   2425 ; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
   2426 ; AVX2-NEXT:    vmovq %xmm2, %r12
   2427 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2428 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2429 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2430 ; AVX2-NEXT:    vpextrq $1, %xmm3, %r15
   2431 ; AVX2-NEXT:    vmovq %xmm3, %rsi
   2432 ; AVX2-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2433 ; AVX2-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2434 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2435 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2436 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2437 ; AVX2-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2438 ; AVX2-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2439 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2440 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2441 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2442 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
   2443 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
   2444 ; AVX2-NEXT:    addq %rbx, %rax
   2445 ; AVX2-NEXT:    movq %rax, %rbx
   2446 ; AVX2-NEXT:    vmovq %xmm4, %r13
   2447 ; AVX2-NEXT:    addq %rbp, %r13
   2448 ; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
   2449 ; AVX2-NEXT:    addq %rdi, %r10
   2450 ; AVX2-NEXT:    vmovq %xmm3, %r14
   2451 ; AVX2-NEXT:    addq %rcx, %r14
   2452 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
   2453 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2454 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2455 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
   2456 ; AVX2-NEXT:    addq %rdx, %rax
   2457 ; AVX2-NEXT:    movq %rax, %rcx
   2458 ; AVX2-NEXT:    vmovq %xmm3, %r8
   2459 ; AVX2-NEXT:    addq %r9, %r8
   2460 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
   2461 ; AVX2-NEXT:    addq %r11, %rax
   2462 ; AVX2-NEXT:    movq %rax, %r11
   2463 ; AVX2-NEXT:    vmovq %xmm2, %rax
   2464 ; AVX2-NEXT:    addq %r12, %rax
   2465 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2466 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2467 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2468 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2469 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
   2470 ; AVX2-NEXT:    addq %r15, %rax
   2471 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2472 ; AVX2-NEXT:    vmovq %xmm3, %rax
   2473 ; AVX2-NEXT:    addq %rsi, %rax
   2474 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2475 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
   2476 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2477 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2478 ; AVX2-NEXT:    vmovq %xmm2, %rax
   2479 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2480 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2481 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2482 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2483 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2484 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rbp
   2485 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
   2486 ; AVX2-NEXT:    vmovq %xmm2, %r9
   2487 ; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
   2488 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
   2489 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
   2490 ; AVX2-NEXT:    addq %rax, %rdi
   2491 ; AVX2-NEXT:    vmovq %xmm1, %rdx
   2492 ; AVX2-NEXT:    vmovq %xmm0, %rsi
   2493 ; AVX2-NEXT:    addq %rdx, %rsi
   2494 ; AVX2-NEXT:    addq $-1, %rbx
   2495 ; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2496 ; AVX2-NEXT:    movl $0, %eax
   2497 ; AVX2-NEXT:    adcq $-1, %rax
   2498 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2499 ; AVX2-NEXT:    addq $-1, %r13
   2500 ; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2501 ; AVX2-NEXT:    movl $0, %eax
   2502 ; AVX2-NEXT:    adcq $-1, %rax
   2503 ; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
   2504 ; AVX2-NEXT:    addq $-1, %r10
   2505 ; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2506 ; AVX2-NEXT:    movl $0, %eax
   2507 ; AVX2-NEXT:    adcq $-1, %rax
   2508 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2509 ; AVX2-NEXT:    addq $-1, %r14
   2510 ; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2511 ; AVX2-NEXT:    movl $0, %r13d
   2512 ; AVX2-NEXT:    adcq $-1, %r13
   2513 ; AVX2-NEXT:    addq $-1, %rcx
   2514 ; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2515 ; AVX2-NEXT:    movl $0, %eax
   2516 ; AVX2-NEXT:    adcq $-1, %rax
   2517 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2518 ; AVX2-NEXT:    addq $-1, %r8
   2519 ; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2520 ; AVX2-NEXT:    movl $0, %r15d
   2521 ; AVX2-NEXT:    adcq $-1, %r15
   2522 ; AVX2-NEXT:    addq $-1, %r11
   2523 ; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2524 ; AVX2-NEXT:    movl $0, %ebx
   2525 ; AVX2-NEXT:    adcq $-1, %rbx
   2526 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2527 ; AVX2-NEXT:    movl $0, %r8d
   2528 ; AVX2-NEXT:    adcq $-1, %r8
   2529 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2530 ; AVX2-NEXT:    movl $0, %eax
   2531 ; AVX2-NEXT:    adcq $-1, %rax
   2532 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2533 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2534 ; AVX2-NEXT:    movl $0, %eax
   2535 ; AVX2-NEXT:    adcq $-1, %rax
   2536 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2537 ; AVX2-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2538 ; AVX2-NEXT:    movl $0, %r12d
   2539 ; AVX2-NEXT:    adcq $-1, %r12
   2540 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2541 ; AVX2-NEXT:    addq $-1, %rcx
   2542 ; AVX2-NEXT:    movl $0, %r11d
   2543 ; AVX2-NEXT:    adcq $-1, %r11
   2544 ; AVX2-NEXT:    addq $-1, %rbp
   2545 ; AVX2-NEXT:    movl $0, %r14d
   2546 ; AVX2-NEXT:    adcq $-1, %r14
   2547 ; AVX2-NEXT:    addq $-1, %r9
   2548 ; AVX2-NEXT:    movl $0, %r10d
   2549 ; AVX2-NEXT:    adcq $-1, %r10
   2550 ; AVX2-NEXT:    addq $-1, %rdi
   2551 ; AVX2-NEXT:    movl $0, %edx
   2552 ; AVX2-NEXT:    adcq $-1, %rdx
   2553 ; AVX2-NEXT:    addq $-1, %rsi
   2554 ; AVX2-NEXT:    movl $0, %eax
   2555 ; AVX2-NEXT:    adcq $-1, %rax
   2556 ; AVX2-NEXT:    shldq $63, %rsi, %rax
   2557 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2558 ; AVX2-NEXT:    shldq $63, %rdi, %rdx
   2559 ; AVX2-NEXT:    shldq $63, %r9, %r10
   2560 ; AVX2-NEXT:    shldq $63, %rbp, %r14
   2561 ; AVX2-NEXT:    shldq $63, %rcx, %r11
   2562 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2563 ; AVX2-NEXT:    shldq $63, %rcx, %r12
   2564 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2565 ; AVX2-NEXT:    shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2566 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2567 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
   2568 ; AVX2-NEXT:    shldq $63, %rcx, %r9
   2569 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2570 ; AVX2-NEXT:    shldq $63, %rcx, %r8
   2571 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2572 ; AVX2-NEXT:    shldq $63, %rax, %rbx
   2573 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2574 ; AVX2-NEXT:    shldq $63, %rax, %r15
   2575 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2576 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2577 ; AVX2-NEXT:    shldq $63, %rcx, %rax
   2578 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2579 ; AVX2-NEXT:    shldq $63, %rcx, %r13
   2580 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
   2581 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2582 ; AVX2-NEXT:    shldq $63, %rcx, %rbp
   2583 ; AVX2-NEXT:    movq (%rsp), %rdi # 8-byte Reload
   2584 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2585 ; AVX2-NEXT:    shldq $63, %rcx, %rdi
   2586 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2587 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
   2588 ; AVX2-NEXT:    shldq $63, %rcx, %rsi
   2589 ; AVX2-NEXT:    vmovq %rsi, %xmm8
   2590 ; AVX2-NEXT:    vmovq %rdi, %xmm9
   2591 ; AVX2-NEXT:    vmovq %rbp, %xmm10
   2592 ; AVX2-NEXT:    vmovq %r13, %xmm11
   2593 ; AVX2-NEXT:    vmovq %rax, %xmm12
   2594 ; AVX2-NEXT:    vmovq %r15, %xmm13
   2595 ; AVX2-NEXT:    vmovq %rbx, %xmm14
   2596 ; AVX2-NEXT:    vmovq %r8, %xmm15
   2597 ; AVX2-NEXT:    vmovq %r9, %xmm0
   2598 ; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
   2599 ; AVX2-NEXT:    # xmm1 = mem[0],zero
   2600 ; AVX2-NEXT:    vmovq %r12, %xmm2
   2601 ; AVX2-NEXT:    vmovq %r11, %xmm3
   2602 ; AVX2-NEXT:    vmovq %r14, %xmm4
   2603 ; AVX2-NEXT:    vmovq %r10, %xmm5
   2604 ; AVX2-NEXT:    vmovq %rdx, %xmm6
   2605 ; AVX2-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
   2606 ; AVX2-NEXT:    # xmm7 = mem[0],zero
   2607 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
   2608 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
   2609 ; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
   2610 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
   2611 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
   2612 ; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
   2613 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
   2614 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
   2615 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
   2616 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
   2617 ; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm8, %ymm8
   2618 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2619 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
   2620 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
   2621 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2622 ; AVX2-NEXT:    vpshufb %ymm1, %ymm8, %ymm2
   2623 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2624 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
   2625 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2626 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   2627 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   2628 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2629 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
   2630 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm5, %ymm3
   2631 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   2632 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   2633 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
   2634 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
   2635 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2636 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2637 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2638 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
   2639 ; AVX2-NEXT:    addq $16, %rsp
   2640 ; AVX2-NEXT:    popq %rbx
   2641 ; AVX2-NEXT:    popq %r12
   2642 ; AVX2-NEXT:    popq %r13
   2643 ; AVX2-NEXT:    popq %r14
   2644 ; AVX2-NEXT:    popq %r15
   2645 ; AVX2-NEXT:    popq %rbp
   2646 ; AVX2-NEXT:    vzeroupper
   2647 ; AVX2-NEXT:    retq
   2648 ;
   2649 ; AVX512-LABEL: not_avg_v16i8_wide_constants:
   2650 ; AVX512:       # %bb.0:
   2651 ; AVX512-NEXT:    pushq %rbp
   2652 ; AVX512-NEXT:    pushq %r15
   2653 ; AVX512-NEXT:    pushq %r14
   2654 ; AVX512-NEXT:    pushq %r13
   2655 ; AVX512-NEXT:    pushq %r12
   2656 ; AVX512-NEXT:    pushq %rbx
   2657 ; AVX512-NEXT:    subq $24, %rsp
   2658 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   2659 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
   2660 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2661 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2662 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   2663 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
   2664 ; AVX512-NEXT:    vpextrq $1, %xmm4, %rbx
   2665 ; AVX512-NEXT:    vmovq %xmm4, %rbp
   2666 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rdi
   2667 ; AVX512-NEXT:    vmovq %xmm3, %rsi
   2668 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2669 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2670 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
   2671 ; AVX512-NEXT:    vmovq %xmm3, %r8
   2672 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
   2673 ; AVX512-NEXT:    vmovq %xmm2, %r12
   2674 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2675 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2676 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2677 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2678 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2679 ; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
   2680 ; AVX512-NEXT:    vmovq %xmm3, %r14
   2681 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
   2682 ; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2683 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   2684 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2685 ; AVX512-NEXT:    vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2686 ; AVX512-NEXT:    vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2687 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2688 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2689 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
   2690 ; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
   2691 ; AVX512-NEXT:    vpextrq $1, %xmm4, %rax
   2692 ; AVX512-NEXT:    addq %rbx, %rax
   2693 ; AVX512-NEXT:    movq %rax, %rbx
   2694 ; AVX512-NEXT:    vmovq %xmm4, %rax
   2695 ; AVX512-NEXT:    addq %rbp, %rax
   2696 ; AVX512-NEXT:    movq %rax, %rbp
   2697 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
   2698 ; AVX512-NEXT:    addq %rdi, %rax
   2699 ; AVX512-NEXT:    movq %rax, %rdi
   2700 ; AVX512-NEXT:    vmovq %xmm3, %r10
   2701 ; AVX512-NEXT:    addq %rsi, %r10
   2702 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2703 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2704 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
   2705 ; AVX512-NEXT:    addq %rdx, %rcx
   2706 ; AVX512-NEXT:    vmovq %xmm3, %rax
   2707 ; AVX512-NEXT:    addq %r8, %rax
   2708 ; AVX512-NEXT:    movq %rax, %r8
   2709 ; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
   2710 ; AVX512-NEXT:    addq %r13, %rsi
   2711 ; AVX512-NEXT:    vmovq %xmm2, %r11
   2712 ; AVX512-NEXT:    addq %r12, %r11
   2713 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2714 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2715 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2716 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   2717 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
   2718 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
   2719 ; AVX512-NEXT:    addq %r15, %rax
   2720 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2721 ; AVX512-NEXT:    vmovq %xmm3, %rax
   2722 ; AVX512-NEXT:    addq %r14, %rax
   2723 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2724 ; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
   2725 ; AVX512-NEXT:    addq %r9, %rax
   2726 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2727 ; AVX512-NEXT:    vmovq %xmm2, %rax
   2728 ; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2729 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2730 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   2731 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2732 ; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
   2733 ; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
   2734 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2735 ; AVX512-NEXT:    vmovq %xmm2, %r14
   2736 ; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
   2737 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
   2738 ; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
   2739 ; AVX512-NEXT:    addq %rax, %r9
   2740 ; AVX512-NEXT:    vmovq %xmm0, %rax
   2741 ; AVX512-NEXT:    vmovq %xmm1, %rdx
   2742 ; AVX512-NEXT:    addq %rax, %rdx
   2743 ; AVX512-NEXT:    addq $-1, %rbx
   2744 ; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2745 ; AVX512-NEXT:    movl $0, %eax
   2746 ; AVX512-NEXT:    adcq $-1, %rax
   2747 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2748 ; AVX512-NEXT:    addq $-1, %rbp
   2749 ; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2750 ; AVX512-NEXT:    movl $0, %eax
   2751 ; AVX512-NEXT:    adcq $-1, %rax
   2752 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2753 ; AVX512-NEXT:    addq $-1, %rdi
   2754 ; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2755 ; AVX512-NEXT:    movl $0, %eax
   2756 ; AVX512-NEXT:    adcq $-1, %rax
   2757 ; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
   2758 ; AVX512-NEXT:    addq $-1, %r10
   2759 ; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2760 ; AVX512-NEXT:    movl $0, %eax
   2761 ; AVX512-NEXT:    adcq $-1, %rax
   2762 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2763 ; AVX512-NEXT:    addq $-1, %rcx
   2764 ; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2765 ; AVX512-NEXT:    movl $0, %eax
   2766 ; AVX512-NEXT:    adcq $-1, %rax
   2767 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2768 ; AVX512-NEXT:    addq $-1, %r8
   2769 ; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2770 ; AVX512-NEXT:    movl $0, %eax
   2771 ; AVX512-NEXT:    adcq $-1, %rax
   2772 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2773 ; AVX512-NEXT:    addq $-1, %rsi
   2774 ; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2775 ; AVX512-NEXT:    movl $0, %r13d
   2776 ; AVX512-NEXT:    adcq $-1, %r13
   2777 ; AVX512-NEXT:    addq $-1, %r11
   2778 ; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2779 ; AVX512-NEXT:    movl $0, %r15d
   2780 ; AVX512-NEXT:    adcq $-1, %r15
   2781 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2782 ; AVX512-NEXT:    movl $0, %eax
   2783 ; AVX512-NEXT:    adcq $-1, %rax
   2784 ; AVX512-NEXT:    movq %rax, %rsi
   2785 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2786 ; AVX512-NEXT:    movl $0, %r12d
   2787 ; AVX512-NEXT:    adcq $-1, %r12
   2788 ; AVX512-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
   2789 ; AVX512-NEXT:    movl $0, %ebx
   2790 ; AVX512-NEXT:    adcq $-1, %rbx
   2791 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
   2792 ; AVX512-NEXT:    addq $-1, %rbp
   2793 ; AVX512-NEXT:    movl $0, %r11d
   2794 ; AVX512-NEXT:    adcq $-1, %r11
   2795 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2796 ; AVX512-NEXT:    addq $-1, %rax
   2797 ; AVX512-NEXT:    movl $0, %r10d
   2798 ; AVX512-NEXT:    adcq $-1, %r10
   2799 ; AVX512-NEXT:    addq $-1, %r14
   2800 ; AVX512-NEXT:    movl $0, %r8d
   2801 ; AVX512-NEXT:    adcq $-1, %r8
   2802 ; AVX512-NEXT:    addq $-1, %r9
   2803 ; AVX512-NEXT:    movl $0, %edi
   2804 ; AVX512-NEXT:    adcq $-1, %rdi
   2805 ; AVX512-NEXT:    addq $-1, %rdx
   2806 ; AVX512-NEXT:    movl $0, %ecx
   2807 ; AVX512-NEXT:    adcq $-1, %rcx
   2808 ; AVX512-NEXT:    shldq $63, %rdx, %rcx
   2809 ; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2810 ; AVX512-NEXT:    shldq $63, %r9, %rdi
   2811 ; AVX512-NEXT:    shldq $63, %r14, %r8
   2812 ; AVX512-NEXT:    shldq $63, %rax, %r10
   2813 ; AVX512-NEXT:    shldq $63, %rbp, %r11
   2814 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2815 ; AVX512-NEXT:    shldq $63, %rdx, %rbx
   2816 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2817 ; AVX512-NEXT:    shldq $63, %rdx, %r12
   2818 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2819 ; AVX512-NEXT:    shldq $63, %rdx, %rsi
   2820 ; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
   2821 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2822 ; AVX512-NEXT:    shldq $63, %rax, %r15
   2823 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2824 ; AVX512-NEXT:    shldq $63, %rax, %r13
   2825 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
   2826 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2827 ; AVX512-NEXT:    shldq $63, %rax, %rsi
   2828 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
   2829 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2830 ; AVX512-NEXT:    shldq $63, %rax, %rcx
   2831 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
   2832 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2833 ; AVX512-NEXT:    shldq $63, %rdx, %rax
   2834 ; AVX512-NEXT:    movq (%rsp), %r14 # 8-byte Reload
   2835 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2836 ; AVX512-NEXT:    shldq $63, %rdx, %r14
   2837 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
   2838 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2839 ; AVX512-NEXT:    shldq $63, %rdx, %r9
   2840 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
   2841 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
   2842 ; AVX512-NEXT:    shldq $63, %rdx, %rbp
   2843 ; AVX512-NEXT:    vmovq %rbp, %xmm8
   2844 ; AVX512-NEXT:    vmovq %r9, %xmm9
   2845 ; AVX512-NEXT:    vmovq %r14, %xmm10
   2846 ; AVX512-NEXT:    vmovq %rax, %xmm11
   2847 ; AVX512-NEXT:    vmovq %rcx, %xmm12
   2848 ; AVX512-NEXT:    vmovq %rsi, %xmm13
   2849 ; AVX512-NEXT:    vmovq %r13, %xmm14
   2850 ; AVX512-NEXT:    vmovq %r15, %xmm15
   2851 ; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
   2852 ; AVX512-NEXT:    # xmm0 = mem[0],zero
   2853 ; AVX512-NEXT:    vmovq %r12, %xmm1
   2854 ; AVX512-NEXT:    vmovq %rbx, %xmm2
   2855 ; AVX512-NEXT:    vmovq %r11, %xmm3
   2856 ; AVX512-NEXT:    vmovq %r10, %xmm4
   2857 ; AVX512-NEXT:    vmovq %r8, %xmm5
   2858 ; AVX512-NEXT:    vmovq %rdi, %xmm6
   2859 ; AVX512-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
   2860 ; AVX512-NEXT:    # xmm7 = mem[0],zero
   2861 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
   2862 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
   2863 ; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
   2864 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
   2865 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
   2866 ; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm10, %ymm9
   2867 ; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
   2868 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2869 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
   2870 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
   2871 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
   2872 ; AVX512-NEXT:    vpmovqd %zmm8, %ymm2
   2873 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm7[0],xmm6[0]
   2874 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
   2875 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
   2876 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   2877 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
   2878 ; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
   2879 ; AVX512-NEXT:    addq $24, %rsp
   2880 ; AVX512-NEXT:    popq %rbx
   2881 ; AVX512-NEXT:    popq %r12
   2882 ; AVX512-NEXT:    popq %r13
   2883 ; AVX512-NEXT:    popq %r14
   2884 ; AVX512-NEXT:    popq %r15
   2885 ; AVX512-NEXT:    popq %rbp
   2886 ; AVX512-NEXT:    vzeroupper
   2887 ; AVX512-NEXT:    retq
   2888   %1 = load <16 x i8>, <16 x i8>* %a
   2889   %2 = load <16 x i8>, <16 x i8>* %b
   2890   %3 = zext <16 x i8> %1 to <16 x i128>
   2891   %4 = zext <16 x i8> %2 to <16 x i128>
   2892   %5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
   2893   %6 = add nuw nsw <16 x i128> %5, %4
   2894   %7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
   2895   %8 = trunc <16 x i128> %7 to <16 x i8>
   2896   store <16 x i8> %8, <16 x i8>* undef, align 4
   2897   ret void
   2898 }
   2899