; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v48i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm6
; SSE2-NEXT: movdqa 32(%rdi), %xmm11
; SSE2-NEXT: movdqa (%rsi), %xmm12
; SSE2-NEXT: movdqa 16(%rsi), %xmm13
; SSE2-NEXT: movdqa 32(%rsi), %xmm0
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE2-NEXT: movdqa %xmm1, %xmm10
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm5, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; SSE2-NEXT: movdqa %xmm6, %xmm14
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: movdqa %xmm12, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
; SSE2-NEXT: paddd %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm11, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; SSE2-NEXT: movdqa %xmm12, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
; SSE2-NEXT: paddd %xmm10, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: movdqa %xmm13, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm4, %xmm10
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
; SSE2-NEXT: paddd %xmm15, %xmm10
; SSE2-NEXT: movdqa %xmm2, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
; SSE2-NEXT: paddd %xmm5, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE2-NEXT: paddd %xmm14, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
; SSE2-NEXT: paddd %xmm6, %xmm13
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm6, %xmm14
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
; SSE2-NEXT: paddd %xmm15, %xmm14
; SSE2-NEXT: movdqa %xmm11, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: paddd %xmm2, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE2-NEXT: paddd %xmm11, %xmm0
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: psubd %xmm5, %xmm8
; SSE2-NEXT: psubd %xmm5, %xmm3
; SSE2-NEXT: psubd %xmm5, %xmm9
; SSE2-NEXT: psubd %xmm5, %xmm12
; SSE2-NEXT: psubd %xmm5, %xmm10
; SSE2-NEXT: psubd %xmm5, %xmm4
; SSE2-NEXT: psubd %xmm5, %xmm1
; SSE2-NEXT: psubd %xmm5, %xmm13
; SSE2-NEXT: psubd %xmm5, %xmm14
; SSE2-NEXT: psubd %xmm5, %xmm6
; SSE2-NEXT: psubd %xmm5, %xmm2
; SSE2-NEXT: psubd %xmm5, %xmm0
; SSE2-NEXT: psrld $1, %xmm3
; SSE2-NEXT: psrld $1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: packuswb %xmm8, %xmm3
; SSE2-NEXT: psrld $1, %xmm12
; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: pand %xmm7, %xmm9
; SSE2-NEXT: pand %xmm7, %xmm12
; SSE2-NEXT: packuswb %xmm9, %xmm12
; SSE2-NEXT: packuswb %xmm3, %xmm12
; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: psrld $1, %xmm10
; SSE2-NEXT: pand %xmm7, %xmm10
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: packuswb %xmm10, %xmm4
; SSE2-NEXT: psrld $1, %xmm13
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm13
; SSE2-NEXT: packuswb %xmm1, %xmm13
; SSE2-NEXT: packuswb %xmm4, %xmm13
; SSE2-NEXT: psrld $1, %xmm6
; SSE2-NEXT: psrld $1, %xmm14
; SSE2-NEXT: pand %xmm7, %xmm14
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: packuswb %xmm14, %xmm6
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm13, (%rax)
; SSE2-NEXT: movdqu %xmm12, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v48i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm2
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10
; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9
; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8
; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11
; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12
; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm5
; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm1
; AVX1-NEXT: vpsubd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4
; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm11, %xmm4
; AVX1-NEXT: vpsrld $1, %xmm8, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovdqa (%rsi), %ymm3
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v48i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v48i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3
; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu %ymm1, (%rax)
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <48 x i8>, <48 x i8>* %a
  %2 = load <48 x i8>, <48 x i8>* %b
  %3 = zext <48 x i8> %1 to <48 x i32>
  %4 = zext <48 x i8> %2 to <48 x i32>
  %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <48 x i32> %5, %4
  %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <48 x i32> %7 to <48 x i8>
  store <48 x i8> %8, <48 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: pavgb 32(%rdi), %xmm2
; SSE2-NEXT: pavgb 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vmovdqa (%rsi), %ymm2
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: pavgw 32(%rdi), %xmm2
; SSE2-NEXT: pavgw 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vmovdqa (%rsi), %ymm2
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pavgb %xmm0, %xmm0
; SSE2-NEXT: pavgb %xmm1, %xmm1
; SSE2-NEXT: pavgb %xmm2, %xmm2
; SSE2-NEXT: pavgb %xmm3, %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rsi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: pavgw 32(%rsi), %xmm2
; SSE2-NEXT: pavgw 48(%rsi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vmovdqa (%rsi), %ymm2
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v4i8_const:
avg_v4i8_const: 1186 ; SSE2: # %bb.0: 1187 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1188 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 1189 ; SSE2-NEXT: movd %xmm0, (%rax) 1190 ; SSE2-NEXT: retq 1191 ; 1192 ; AVX-LABEL: avg_v4i8_const: 1193 ; AVX: # %bb.0: 1194 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1195 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 1196 ; AVX-NEXT: vmovd %xmm0, (%rax) 1197 ; AVX-NEXT: retq 1198 %1 = load <4 x i8>, <4 x i8>* %a 1199 %2 = zext <4 x i8> %1 to <4 x i32> 1200 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4> 1201 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1> 1202 %5 = trunc <4 x i32> %4 to <4 x i8> 1203 store <4 x i8> %5, <4 x i8>* undef, align 4 1204 ret void 1205 } 1206 1207 define void @avg_v8i8_const(<8 x i8>* %a) nounwind { 1208 ; SSE2-LABEL: avg_v8i8_const: 1209 ; SSE2: # %bb.0: 1210 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1211 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 1212 ; SSE2-NEXT: movq %xmm0, (%rax) 1213 ; SSE2-NEXT: retq 1214 ; 1215 ; AVX-LABEL: avg_v8i8_const: 1216 ; AVX: # %bb.0: 1217 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1218 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 1219 ; AVX-NEXT: vmovq %xmm0, (%rax) 1220 ; AVX-NEXT: retq 1221 %1 = load <8 x i8>, <8 x i8>* %a 1222 %2 = zext <8 x i8> %1 to <8 x i32> 1223 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1224 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1225 %5 = trunc <8 x i32> %4 to <8 x i8> 1226 store <8 x i8> %5, <8 x i8>* undef, align 4 1227 ret void 1228 } 1229 1230 define void @avg_v16i8_const(<16 x i8>* %a) nounwind { 1231 ; SSE2-LABEL: avg_v16i8_const: 1232 ; SSE2: # %bb.0: 1233 ; SSE2-NEXT: movdqa (%rdi), %xmm0 1234 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 1235 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1236 ; SSE2-NEXT: retq 1237 ; 1238 ; AVX-LABEL: avg_v16i8_const: 1239 ; AVX: # %bb.0: 1240 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 1241 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 1242 ; AVX-NEXT: vmovdqu %xmm0, (%rax) 1243 ; AVX-NEXT: retq 1244 %1 = load <16 x i8>, <16 x i8>* %a 1245 %2 = zext <16 x i8> %1 to <16 x i32> 1246 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1247 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1248 %5 = trunc <16 x i32> %4 to <16 x i8> 1249 store <16 x i8> %5, <16 x i8>* undef, align 4 1250 ret void 1251 } 1252 1253 define void @avg_v32i8_const(<32 x i8>* %a) nounwind { 1254 ; SSE2-LABEL: avg_v32i8_const: 1255 ; SSE2: # %bb.0: 1256 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1257 ; SSE2-NEXT: movdqa (%rdi), %xmm1 1258 ; SSE2-NEXT: pavgb %xmm0, %xmm1 1259 ; SSE2-NEXT: pavgb 16(%rdi), %xmm0 1260 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1261 ; SSE2-NEXT: movdqu %xmm1, (%rax) 1262 ; SSE2-NEXT: retq 1263 ; 1264 ; AVX1-LABEL: avg_v32i8_const: 1265 ; AVX1: # %bb.0: 1266 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1267 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1268 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] 1269 ; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 1270 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 1271 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1272 ; AVX1-NEXT: vmovups %ymm0, (%rax) 1273 ; AVX1-NEXT: vzeroupper 1274 ; AVX1-NEXT: retq 1275 ; 1276 ; AVX2-LABEL: avg_v32i8_const: 1277 ; AVX2: # %bb.0: 1278 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1279 ; 
AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 1280 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1281 ; AVX2-NEXT: vzeroupper 1282 ; AVX2-NEXT: retq 1283 ; 1284 ; AVX512-LABEL: avg_v32i8_const: 1285 ; AVX512: # %bb.0: 1286 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1287 ; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 1288 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) 1289 ; AVX512-NEXT: vzeroupper 1290 ; AVX512-NEXT: retq 1291 %1 = load <32 x i8>, <32 x i8>* %a 1292 %2 = zext <32 x i8> %1 to <32 x i32> 1293 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1294 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1295 %5 = trunc <32 x i32> %4 to <32 x i8> 1296 store <32 x i8> %5, <32 x i8>* undef, align 4 1297 ret void 1298 } 1299 1300 define void @avg_v64i8_const(<64 x i8>* %a) nounwind { 1301 ; SSE2-LABEL: avg_v64i8_const: 1302 ; SSE2: # %bb.0: 1303 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1304 ; SSE2-NEXT: movdqa (%rdi), %xmm1 1305 ; SSE2-NEXT: pavgb %xmm0, %xmm1 1306 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 1307 ; SSE2-NEXT: pavgb %xmm0, %xmm2 1308 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 1309 ; SSE2-NEXT: pavgb %xmm0, %xmm3 1310 ; SSE2-NEXT: pavgb 48(%rdi), %xmm0 1311 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1312 ; SSE2-NEXT: movdqu %xmm3, (%rax) 1313 ; SSE2-NEXT: movdqu %xmm2, (%rax) 1314 ; SSE2-NEXT: movdqu %xmm1, (%rax) 1315 ; SSE2-NEXT: retq 1316 ; 1317 ; AVX1-LABEL: avg_v64i8_const: 1318 ; AVX1: # %bb.0: 1319 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1320 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 1321 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1322 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] 1323 ; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 1324 ; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0 1325 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1326 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1327 ; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 1328 ; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1 1329 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1330 ; AVX1-NEXT: vmovups %ymm1, (%rax) 1331 ; AVX1-NEXT: vmovups %ymm0, (%rax) 1332 ; AVX1-NEXT: vzeroupper 1333 ; AVX1-NEXT: retq 1334 ; 1335 ; AVX2-LABEL: avg_v64i8_const: 1336 ; AVX2: # %bb.0: 1337 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] 1338 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1 1339 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 1340 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1341 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) 1342 ; AVX2-NEXT: vzeroupper 1343 ; AVX2-NEXT: retq 1344 ; 1345 ; AVX512F-LABEL: avg_v64i8_const: 1346 ; AVX512F: # %bb.0: 1347 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] 1348 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1 1349 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 1350 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) 1351 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) 1352 ; AVX512F-NEXT: vzeroupper 1353 ; AVX512F-NEXT: retq 1354 ; 1355 ; AVX512BW-LABEL: avg_v64i8_const: 1356 ; AVX512BW: # %bb.0: 1357 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1358 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 1359 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
(%rax) 1360 ; AVX512BW-NEXT: vzeroupper 1361 ; AVX512BW-NEXT: retq 1362 %1 = load <64 x i8>, <64 x i8>* %a 1363 %2 = zext <64 x i8> %1 to <64 x i32> 1364 %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1365 %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1366 %5 = trunc <64 x i32> %4 to <64 x i8> 1367 store <64 x i8> %5, <64 x i8>* undef, align 4 1368 ret void 1369 } 1370 1371 define void @avg_v4i16_const(<4 x i16>* %a) nounwind { 1372 ; SSE2-LABEL: avg_v4i16_const: 1373 ; SSE2: # %bb.0: 1374 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1375 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 1376 ; SSE2-NEXT: movq %xmm0, (%rax) 1377 ; SSE2-NEXT: retq 1378 ; 1379 ; AVX-LABEL: avg_v4i16_const: 1380 ; AVX: # %bb.0: 1381 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1382 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 1383 ; AVX-NEXT: vmovq %xmm0, (%rax) 1384 ; AVX-NEXT: retq 1385 %1 = load <4 x i16>, <4 x i16>* %a 1386 %2 = zext <4 x i16> %1 to <4 x i32> 1387 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4> 1388 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1> 1389 %5 = trunc <4 x i32> %4 to <4 x i16> 1390 store <4 x i16> %5, <4 x i16>* undef, align 4 1391 ret void 1392 } 1393 1394 define void @avg_v8i16_const(<8 x i16>* %a) nounwind { 1395 ; SSE2-LABEL: avg_v8i16_const: 1396 ; SSE2: # %bb.0: 1397 ; SSE2-NEXT: movdqa (%rdi), %xmm0 1398 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 1399 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1400 ; SSE2-NEXT: retq 1401 ; 1402 ; AVX-LABEL: avg_v8i16_const: 1403 ; AVX: # %bb.0: 1404 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 1405 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 1406 ; AVX-NEXT: vmovdqu %xmm0, (%rax) 1407 ; AVX-NEXT: retq 1408 %1 = load <8 x i16>, <8 x i16>* %a 1409 %2 = zext <8 x i16> %1 to <8 x i32> 1410 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1411 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1412 %5 = trunc <8 x i32> %4 to <8 x i16> 1413 store <8 x i16> %5, <8 x i16>* undef, align 4 1414 ret void 1415 } 1416 1417 define void @avg_v16i16_const(<16 x i16>* %a) nounwind { 1418 ; SSE2-LABEL: avg_v16i16_const: 1419 ; SSE2: # %bb.0: 1420 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] 1421 ; SSE2-NEXT: movdqa (%rdi), %xmm1 1422 ; SSE2-NEXT: pavgw %xmm0, %xmm1 1423 ; SSE2-NEXT: pavgw 16(%rdi), %xmm0 1424 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1425 ; SSE2-NEXT: movdqu %xmm1, (%rax) 1426 ; SSE2-NEXT: retq 1427 ; 1428 ; AVX1-LABEL: avg_v16i16_const: 1429 ; AVX1: # %bb.0: 1430 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1431 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1432 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1433 ; AVX1-NEXT: vpavgw %xmm2, 
%xmm1, %xmm1 1434 ; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 1435 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1436 ; AVX1-NEXT: vmovups %ymm0, (%rax) 1437 ; AVX1-NEXT: vzeroupper 1438 ; AVX1-NEXT: retq 1439 ; 1440 ; AVX2-LABEL: avg_v16i16_const: 1441 ; AVX2: # %bb.0: 1442 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1443 ; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 1444 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1445 ; AVX2-NEXT: vzeroupper 1446 ; AVX2-NEXT: retq 1447 ; 1448 ; AVX512-LABEL: avg_v16i16_const: 1449 ; AVX512: # %bb.0: 1450 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1451 ; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 1452 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) 1453 ; AVX512-NEXT: vzeroupper 1454 ; AVX512-NEXT: retq 1455 %1 = load <16 x i16>, <16 x i16>* %a 1456 %2 = zext <16 x i16> %1 to <16 x i32> 1457 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1458 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1459 %5 = trunc <16 x i32> %4 to <16 x i16> 1460 store <16 x i16> %5, <16 x i16>* undef, align 4 1461 ret void 1462 } 1463 1464 define void @avg_v32i16_const(<32 x i16>* %a) nounwind { 1465 ; SSE2-LABEL: avg_v32i16_const: 1466 ; SSE2: # %bb.0: 1467 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] 1468 ; SSE2-NEXT: movdqa (%rdi), %xmm1 1469 ; SSE2-NEXT: pavgw %xmm0, %xmm1 1470 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 1471 ; SSE2-NEXT: pavgw %xmm0, %xmm2 1472 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 1473 ; SSE2-NEXT: pavgw %xmm0, %xmm3 1474 ; SSE2-NEXT: pavgw 48(%rdi), %xmm0 1475 ; SSE2-NEXT: movdqu %xmm0, (%rax) 1476 ; SSE2-NEXT: movdqu %xmm3, (%rax) 1477 ; SSE2-NEXT: movdqu %xmm2, (%rax) 1478 ; SSE2-NEXT: movdqu %xmm1, (%rax) 1479 ; SSE2-NEXT: retq 1480 ; 1481 ; AVX1-LABEL: avg_v32i16_const: 1482 ; AVX1: # %bb.0: 1483 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1484 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 1485 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1486 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] 1487 ; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 1488 ; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0 1489 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1490 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1491 ; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 1492 ; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1 1493 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1494 ; AVX1-NEXT: vmovups %ymm1, (%rax) 1495 ; AVX1-NEXT: vmovups %ymm0, (%rax) 1496 ; AVX1-NEXT: vzeroupper 1497 ; AVX1-NEXT: retq 1498 ; 1499 ; AVX2-LABEL: avg_v32i16_const: 1500 ; AVX2: # %bb.0: 1501 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1502 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1] 1503 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 1504 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 1505 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) 1506 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) 1507 ; AVX2-NEXT: vzeroupper 1508 ; AVX2-NEXT: retq 1509 ; 1510 ; AVX512F-LABEL: avg_v32i16_const: 1511 ; AVX512F: # %bb.0: 1512 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] 1513 ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] 1514 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1 1515 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 1516 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) 1517 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) 1518 ; AVX512F-NEXT: vzeroupper 1519 ; AVX512F-NEXT: retq 1520 ; 1521 ; AVX512BW-LABEL: avg_v32i16_const: 1522 ; AVX512BW: # %bb.0: 1523 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 
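; A single 512-bit vpavgw covers all 32 lanes here, with the constant vector folded as a (%rip)-relative memory operand.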
1524 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 1525 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) 1526 ; AVX512BW-NEXT: vzeroupper 1527 ; AVX512BW-NEXT: retq 1528 %1 = load <32 x i16>, <32 x i16>* %a 1529 %2 = zext <32 x i16> %1 to <32 x i32> 1530 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 1531 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1532 %5 = trunc <32 x i32> %4 to <32 x i16> 1533 store <32 x i16> %5, <32 x i16>* undef, align 4 1534 ret void 1535 } 1536 1537 define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind { 1538 ; SSE2-LABEL: avg_v16i8_3: 1539 ; SSE2: # %bb.0: 1540 ; SSE2-NEXT: pavgb %xmm1, %xmm0 1541 ; SSE2-NEXT: retq 1542 ; 1543 ; AVX-LABEL: avg_v16i8_3: 1544 ; AVX: # %bb.0: 1545 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 1546 ; AVX-NEXT: retq 1547 %za = zext <16 x i8> %a to <16 x i16> 1548 %zb = zext <16 x i8> %b to <16 x i16> 1549 %add = add nuw nsw <16 x i16> %za, %zb 1550 %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1551 %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1552 %res = trunc <16 x i16> %lshr to <16 x i8> 1553 ret <16 x i8> %res 1554 } 1555 1556 define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind { 1557 ; SSE2-LABEL: avg_v32i8_3: 1558 ; SSE2: # %bb.0: 1559 ; SSE2-NEXT: pavgb %xmm2, %xmm0 1560 ; SSE2-NEXT: pavgb %xmm3, %xmm1 1561 ; SSE2-NEXT: retq 1562 ; 1563 ; AVX1-LABEL: avg_v32i8_3: 1564 ; AVX1: # %bb.0: 1565 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1566 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1567 ; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2 1568 ; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 1569 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1570 ; AVX1-NEXT: retq 1571 ; 1572 ; AVX2-LABEL: avg_v32i8_3: 1573 ; AVX2: # %bb.0: 1574 ; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 1575 ; AVX2-NEXT: retq 1576 ; 1577 ; AVX512-LABEL: avg_v32i8_3: 1578 ; AVX512: # %bb.0: 1579 ; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0 1580 ; AVX512-NEXT: retq 1581 %za = zext <32 x i8> %a to <32 x i16> 1582 %zb = zext <32 x i8> %b to <32 x i16> 1583 %add = add nuw nsw <32 x i16> %za, %zb 1584 %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1585 %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1586 %res = trunc <32 x i16> %lshr to <32 x i8> 1587 ret <32 x i8> %res 1588 } 1589 1590 define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind { 1591 ; SSE2-LABEL: avg_v64i8_3: 1592 ; SSE2: # %bb.0: 1593 ; SSE2-NEXT: pavgb %xmm4, %xmm0 1594 ; SSE2-NEXT: pavgb %xmm5, %xmm1 1595 ; SSE2-NEXT: pavgb %xmm6, %xmm2 1596 ; SSE2-NEXT: pavgb %xmm7, %xmm3 
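; Under SSE2 the <64 x i8> average lowers to four register-to-register pavgb ops, one per 128-bit quarter of the operands.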
1597 ; SSE2-NEXT: retq 1598 ; 1599 ; AVX1-LABEL: avg_v64i8_3: 1600 ; AVX1: # %bb.0: 1601 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1602 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 1603 ; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4 1604 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 1605 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 1606 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1607 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1608 ; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 1609 ; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1 1610 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1611 ; AVX1-NEXT: retq 1612 ; 1613 ; AVX2-LABEL: avg_v64i8_3: 1614 ; AVX2: # %bb.0: 1615 ; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 1616 ; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1 1617 ; AVX2-NEXT: retq 1618 ; 1619 ; AVX512F-LABEL: avg_v64i8_3: 1620 ; AVX512F: # %bb.0: 1621 ; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 1622 ; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 1623 ; AVX512F-NEXT: retq 1624 ; 1625 ; AVX512BW-LABEL: avg_v64i8_3: 1626 ; AVX512BW: # %bb.0: 1627 ; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0 1628 ; AVX512BW-NEXT: retq 1629 %za = zext <64 x i8> %a to <64 x i16> 1630 %zb = zext <64 x i8> %b to <64 x i16> 1631 %add = add nuw nsw <64 x i16> %za, %zb 1632 %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1633 %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1634 %res = trunc <64 x i16> %lshr to <64 x i8> 1635 ret <64 x i8> %res 1636 } 1637 1638 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { 1639 ; SSE2-LABEL: avg_v512i8_3: 1640 ; SSE2: # %bb.0: 1641 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1642 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1643 ; SSE2-NEXT: movdqa %xmm8, 496(%rdi) 1644 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1645 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1646 ; SSE2-NEXT: movdqa %xmm8, 480(%rdi) 1647 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1648 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1649 ; SSE2-NEXT: movdqa %xmm8, 464(%rdi) 1650 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1651 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1652 ; SSE2-NEXT: movdqa %xmm8, 448(%rdi) 1653 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1654 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1655 ; SSE2-NEXT: movdqa %xmm8, 432(%rdi) 1656 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1657 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1658 ; SSE2-NEXT: movdqa %xmm8, 416(%rdi) 1659 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1660 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1661 ; SSE2-NEXT: movdqa %xmm8, 400(%rdi) 1662 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1663 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1664 ; SSE2-NEXT: movdqa %xmm8, 384(%rdi) 1665 ; SSE2-NEXT: movdqa 
{{[0-9]+}}(%rsp), %xmm8 1666 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1667 ; SSE2-NEXT: movdqa %xmm8, 368(%rdi) 1668 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1669 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1670 ; SSE2-NEXT: movdqa %xmm8, 352(%rdi) 1671 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1672 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1673 ; SSE2-NEXT: movdqa %xmm8, 336(%rdi) 1674 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1675 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1676 ; SSE2-NEXT: movdqa %xmm8, 320(%rdi) 1677 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1678 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1679 ; SSE2-NEXT: movdqa %xmm8, 304(%rdi) 1680 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1681 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1682 ; SSE2-NEXT: movdqa %xmm8, 288(%rdi) 1683 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1684 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1685 ; SSE2-NEXT: movdqa %xmm8, 272(%rdi) 1686 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1687 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1688 ; SSE2-NEXT: movdqa %xmm8, 256(%rdi) 1689 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1690 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1691 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) 1692 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1693 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1694 ; SSE2-NEXT: movdqa %xmm8, 224(%rdi) 1695 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1696 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1697 ; SSE2-NEXT: movdqa %xmm8, 208(%rdi) 1698 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1699 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1700 ; SSE2-NEXT: movdqa %xmm8, 192(%rdi) 1701 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1702 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1703 ; SSE2-NEXT: movdqa %xmm8, 176(%rdi) 1704 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1705 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1706 ; SSE2-NEXT: movdqa %xmm8, 160(%rdi) 1707 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1708 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1709 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi) 1710 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 1711 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 1712 ; SSE2-NEXT: movdqa %xmm8, 128(%rdi) 1713 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7 1714 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi) 1715 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6 1716 ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) 1717 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5 1718 ; SSE2-NEXT: movdqa %xmm5, 80(%rdi) 1719 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4 1720 ; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 1721 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3 1722 ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 1723 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2 1724 ; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 1725 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1 1726 ; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 1727 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0 1728 ; SSE2-NEXT: movdqa %xmm0, (%rdi) 1729 ; SSE2-NEXT: movq %rdi, %rax 1730 ; SSE2-NEXT: retq 1731 ; 1732 ; AVX1-LABEL: avg_v512i8_3: 1733 ; AVX1: # %bb.0: 1734 ; AVX1-NEXT: pushq %rbp 1735 ; AVX1-NEXT: movq %rsp, %rbp 1736 ; AVX1-NEXT: andq $-32, %rsp 1737 ; AVX1-NEXT: subq $128, %rsp 1738 ; AVX1-NEXT: vmovdqa 144(%rbp), %ymm8 1739 ; AVX1-NEXT: vmovdqa 112(%rbp), %ymm9 1740 ; AVX1-NEXT: vmovdqa 80(%rbp), %ymm10 1741 ; AVX1-NEXT: vmovdqa 48(%rbp), %ymm11 1742 ; AVX1-NEXT: vmovdqa 16(%rbp), %ymm12 1743 ; AVX1-NEXT: vmovdqa 272(%rbp), %ymm13 1744 ; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14 1745 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm15 1746 ; AVX1-NEXT: vpavgb %xmm14, 
%xmm15, %xmm14 1747 ; AVX1-NEXT: vmovdqa 304(%rbp), %ymm15 1748 ; AVX1-NEXT: vpavgb %xmm13, %xmm0, %xmm0 1749 ; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 1750 ; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1751 ; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 1752 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1753 ; AVX1-NEXT: vpavgb %xmm14, %xmm0, %xmm0 1754 ; AVX1-NEXT: vmovdqa 336(%rbp), %ymm14 1755 ; AVX1-NEXT: vpavgb %xmm15, %xmm1, %xmm1 1756 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1757 ; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1758 ; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm0 1759 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 1760 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1761 ; AVX1-NEXT: vmovdqa 368(%rbp), %ymm1 1762 ; AVX1-NEXT: vpavgb %xmm14, %xmm2, %xmm2 1763 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1764 ; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill 1765 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1766 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 1767 ; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 1768 ; AVX1-NEXT: vmovdqa 400(%rbp), %ymm2 1769 ; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1 1770 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 1771 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1772 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1 1773 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1774 ; AVX1-NEXT: vmovdqa 432(%rbp), %ymm1 1775 ; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 1776 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm4 1777 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1778 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 1779 ; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 1780 ; AVX1-NEXT: vmovdqa 464(%rbp), %ymm2 1781 ; AVX1-NEXT: vpavgb %xmm1, %xmm5, %xmm1 1782 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5 1783 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1784 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 1785 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1786 ; AVX1-NEXT: vmovdqa 496(%rbp), %ymm1 1787 ; AVX1-NEXT: vpavgb %xmm2, %xmm6, %xmm2 1788 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6 1789 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1790 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2 1791 ; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 1792 ; AVX1-NEXT: vmovdqa 528(%rbp), %ymm2 1793 ; AVX1-NEXT: vpavgb %xmm1, %xmm7, %xmm1 1794 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 1795 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1796 ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm1 1797 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1798 ; AVX1-NEXT: vmovdqa 560(%rbp), %ymm1 1799 ; AVX1-NEXT: vpavgb %xmm2, %xmm12, %xmm2 1800 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm12 1801 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1802 ; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm2 1803 ; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 1804 ; AVX1-NEXT: vmovdqa 592(%rbp), %ymm2 1805 ; AVX1-NEXT: vpavgb %xmm1, %xmm11, %xmm1 1806 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11 1807 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1808 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm1 1809 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1810 ; AVX1-NEXT: vmovdqa 624(%rbp), %ymm1 1811 ; AVX1-NEXT: vpavgb %xmm2, %xmm10, %xmm2 1812 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm10 1813 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 1814 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2 1815 ; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 1816 ; AVX1-NEXT: vmovdqa 656(%rbp), %ymm2 1817 ; AVX1-NEXT: vpavgb %xmm1, %xmm9, %xmm1 1818 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 1819 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 1820 ; 
AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 1821 ; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 1822 ; AVX1-NEXT: vmovdqa 176(%rbp), %ymm1 1823 ; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 1824 ; AVX1-NEXT: vmovdqa 688(%rbp), %ymm8 1825 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1826 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 1827 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13 1828 ; AVX1-NEXT: vpavgb %xmm2, %xmm13, %xmm2 1829 ; AVX1-NEXT: vpavgb %xmm8, %xmm1, %xmm1 1830 ; AVX1-NEXT: vmovdqa 208(%rbp), %ymm8 1831 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm13 1832 ; AVX1-NEXT: vmovdqa 720(%rbp), %ymm2 1833 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 1834 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm15 1835 ; AVX1-NEXT: vpavgb %xmm1, %xmm15, %xmm1 1836 ; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 1837 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1838 ; AVX1-NEXT: vmovdqa 240(%rbp), %ymm15 1839 ; AVX1-NEXT: vmovdqa 752(%rbp), %ymm8 1840 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 1841 ; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 1842 ; AVX1-NEXT: vpavgb %xmm2, %xmm14, %xmm2 1843 ; AVX1-NEXT: vpavgb %xmm8, %xmm15, %xmm8 1844 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 1845 ; AVX1-NEXT: vmovaps %ymm2, 480(%rdi) 1846 ; AVX1-NEXT: vmovaps %ymm1, 448(%rdi) 1847 ; AVX1-NEXT: vmovaps %ymm13, 416(%rdi) 1848 ; AVX1-NEXT: vmovaps %ymm0, 384(%rdi) 1849 ; AVX1-NEXT: vmovaps %ymm9, 352(%rdi) 1850 ; AVX1-NEXT: vmovaps %ymm10, 320(%rdi) 1851 ; AVX1-NEXT: vmovaps %ymm11, 288(%rdi) 1852 ; AVX1-NEXT: vmovaps %ymm12, 256(%rdi) 1853 ; AVX1-NEXT: vmovaps %ymm7, 224(%rdi) 1854 ; AVX1-NEXT: vmovaps %ymm6, 192(%rdi) 1855 ; AVX1-NEXT: vmovaps %ymm5, 160(%rdi) 1856 ; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) 1857 ; AVX1-NEXT: vmovaps %ymm3, 96(%rdi) 1858 ; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload 1859 ; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) 1860 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1861 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) 1862 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1863 ; AVX1-NEXT: vmovaps %ymm0, (%rdi) 1864 ; AVX1-NEXT: movq %rdi, %rax 1865 ; AVX1-NEXT: movq %rbp, %rsp 1866 ; AVX1-NEXT: popq %rbp 1867 ; AVX1-NEXT: vzeroupper 1868 ; AVX1-NEXT: retq 1869 ; 1870 ; AVX2-LABEL: avg_v512i8_3: 1871 ; AVX2: # %bb.0: 1872 ; AVX2-NEXT: pushq %rbp 1873 ; AVX2-NEXT: movq %rsp, %rbp 1874 ; AVX2-NEXT: andq $-32, %rsp 1875 ; AVX2-NEXT: subq $32, %rsp 1876 ; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8 1877 ; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9 1878 ; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10 1879 ; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11 1880 ; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12 1881 ; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13 1882 ; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14 1883 ; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15 1884 ; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 1885 ; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 1886 ; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 1887 ; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 1888 ; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 1889 ; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 1890 ; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 1891 ; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 1892 ; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 1893 ; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 1894 ; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 1895 ; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 1896 ; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 1897 ; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 1898 ; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 1899 ; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 1900 
; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi) 1901 ; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi) 1902 ; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi) 1903 ; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi) 1904 ; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi) 1905 ; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi) 1906 ; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi) 1907 ; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi) 1908 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi) 1909 ; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi) 1910 ; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi) 1911 ; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi) 1912 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi) 1913 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi) 1914 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) 1915 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi) 1916 ; AVX2-NEXT: movq %rdi, %rax 1917 ; AVX2-NEXT: movq %rbp, %rsp 1918 ; AVX2-NEXT: popq %rbp 1919 ; AVX2-NEXT: vzeroupper 1920 ; AVX2-NEXT: retq 1921 ; 1922 ; AVX512F-LABEL: avg_v512i8_3: 1923 ; AVX512F: # %bb.0: 1924 ; AVX512F-NEXT: pushq %rbp 1925 ; AVX512F-NEXT: movq %rsp, %rbp 1926 ; AVX512F-NEXT: andq $-32, %rsp 1927 ; AVX512F-NEXT: subq $32, %rsp 1928 ; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8 1929 ; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9 1930 ; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10 1931 ; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11 1932 ; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12 1933 ; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13 1934 ; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14 1935 ; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15 1936 ; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 1937 ; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 1938 ; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 1939 ; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 1940 ; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 1941 ; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 1942 ; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 1943 ; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 1944 ; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 1945 ; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 1946 ; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 1947 ; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 1948 ; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 1949 ; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 1950 ; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 1951 ; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 1952 ; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi) 1953 ; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi) 1954 ; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi) 1955 ; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi) 1956 ; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi) 1957 ; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi) 1958 ; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi) 1959 ; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi) 1960 ; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi) 1961 ; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi) 1962 ; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi) 1963 ; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi) 1964 ; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi) 1965 ; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi) 1966 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi) 1967 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) 1968 ; AVX512F-NEXT: movq %rdi, %rax 1969 ; AVX512F-NEXT: movq %rbp, %rsp 1970 ; AVX512F-NEXT: popq %rbp 1971 ; AVX512F-NEXT: vzeroupper 1972 ; AVX512F-NEXT: retq 1973 ; 1974 ; AVX512BW-LABEL: avg_v512i8_3: 1975 ; AVX512BW: # %bb.0: 1976 ; AVX512BW-NEXT: pushq %rbp 1977 ; AVX512BW-NEXT: movq %rsp, %rbp 1978 ; AVX512BW-NEXT: andq $-64, %rsp 1979 ; AVX512BW-NEXT: subq $64, %rsp 1980 ; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0 1981 ; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1 1982 ; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2 1983 ; AVX512BW-NEXT: 
vpavgb 208(%rbp), %zmm3, %zmm3 1984 ; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4 1985 ; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5 1986 ; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6 1987 ; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7 1988 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi) 1989 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi) 1990 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi) 1991 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi) 1992 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi) 1993 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi) 1994 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi) 1995 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi) 1996 ; AVX512BW-NEXT: movq %rdi, %rax 1997 ; AVX512BW-NEXT: movq %rbp, %rsp 1998 ; AVX512BW-NEXT: popq %rbp 1999 ; AVX512BW-NEXT: vzeroupper 2000 ; AVX512BW-NEXT: retq 2001 %za = zext <512 x i8> %a to <512 x i16> 2002 %zb = zext <512 x i8> %b to <512 x i16> 2003 %add = add nuw nsw <512 x i16> %za, %zb 2004 %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 2005 %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 2006 %res = trunc <512 x i16> %lshr to <512 x i8> 2007 ret <512 x i8> %res 2008 } 2009 2010 ; This is not an avg, but its structurally similar and previously caused a crash 2011 ; because the constants can't be read with APInt::getZExtValue. 2012 define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind { 2013 ; SSE2-LABEL: not_avg_v16i8_wide_constants: 2014 ; SSE2: # %bb.0: 2015 ; SSE2-NEXT: pushq %rbp 2016 ; SSE2-NEXT: pushq %r15 2017 ; SSE2-NEXT: pushq %r14 2018 ; SSE2-NEXT: pushq %r13 2019 ; SSE2-NEXT: pushq %r12 2020 ; SSE2-NEXT: pushq %rbx 2021 ; SSE2-NEXT: movaps (%rdi), %xmm1 2022 ; SSE2-NEXT: movaps (%rsi), %xmm0 2023 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) 2024 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 2025 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2026 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 2027 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2028 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 2029 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2030 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 2031 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2032 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d 2033 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d 2034 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d 2035 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d 2036 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d 2037 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d 2038 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi 2039 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi 2040 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx 2041 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx 2042 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp 2043 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 2044 ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 2045 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d 2046 ; SSE2-NEXT: leaq -1(%rax,%r9), %rax 2047 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2048 ; SSE2-NEXT: leaq -1(%rbp,%rbx), %rbp 2049 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2050 ; SSE2-NEXT: leaq -1(%rdx,%rbx), %rdx 2051 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2052 ; SSE2-NEXT: leaq -1(%rcx,%rbx), %rcx 2053 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2054 ; SSE2-NEXT: leaq -1(%rsi,%rbx), %rsi 2055 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2056 ; SSE2-NEXT: leaq -1(%rdi,%rbx), %r8 2057 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2058 ; SSE2-NEXT: leaq -1(%r11,%rbx), %r9 2059 ; 
SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2060 ; SSE2-NEXT: leaq -1(%r10,%rbx), %r11 2061 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2062 ; SSE2-NEXT: leaq -1(%r13,%rbx), %r13 2063 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2064 ; SSE2-NEXT: leaq -1(%r12,%rbx), %r12 2065 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2066 ; SSE2-NEXT: leaq -1(%r15,%rbx), %r15 2067 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2068 ; SSE2-NEXT: leaq -1(%r14,%rbx), %r14 2069 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2070 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload 2071 ; SSE2-NEXT: leaq -1(%rdi,%rbx), %rdi 2072 ; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2073 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2074 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload 2075 ; SSE2-NEXT: leaq -1(%rdi,%rbx), %rbx 2076 ; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2077 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2078 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload 2079 ; SSE2-NEXT: leaq -1(%rdi,%rbx), %rbx 2080 ; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2081 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx 2082 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload 2083 ; SSE2-NEXT: leaq -1(%r10,%rbx), %rbx 2084 ; SSE2-NEXT: shrq %rax 2085 ; SSE2-NEXT: movd %eax, %xmm8 2086 ; SSE2-NEXT: shrq %rbp 2087 ; SSE2-NEXT: movd %ebp, %xmm15 2088 ; SSE2-NEXT: shrq %rdx 2089 ; SSE2-NEXT: movd %edx, %xmm9 2090 ; SSE2-NEXT: shrq %rcx 2091 ; SSE2-NEXT: movd %ecx, %xmm2 2092 ; SSE2-NEXT: shrq %rsi 2093 ; SSE2-NEXT: movd %esi, %xmm10 2094 ; SSE2-NEXT: shrq %r8 2095 ; SSE2-NEXT: movd %r8d, %xmm6 2096 ; SSE2-NEXT: shrq %r9 2097 ; SSE2-NEXT: movd %r9d, %xmm11 2098 ; SSE2-NEXT: shrq %r11 2099 ; SSE2-NEXT: movd %r11d, %xmm5 2100 ; SSE2-NEXT: shrq %r13 2101 ; SSE2-NEXT: movd %r13d, %xmm12 2102 ; SSE2-NEXT: shrq %r12 2103 ; SSE2-NEXT: movd %r12d, %xmm3 2104 ; SSE2-NEXT: shrq %r15 2105 ; SSE2-NEXT: movd %r15d, %xmm13 2106 ; SSE2-NEXT: shrq %r14 2107 ; SSE2-NEXT: movd %r14d, %xmm7 2108 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2109 ; SSE2-NEXT: shrq %rax 2110 ; SSE2-NEXT: movd %eax, %xmm14 2111 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2112 ; SSE2-NEXT: shrq %rax 2113 ; SSE2-NEXT: movd %eax, %xmm4 2114 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 2115 ; SSE2-NEXT: shrq %rax 2116 ; SSE2-NEXT: movd %eax, %xmm0 2117 ; SSE2-NEXT: shrq %rbx 2118 ; SSE2-NEXT: movd %ebx, %xmm1 2119 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] 2120 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 2121 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] 2122 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 2123 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] 2124 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 2125 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 2126 ; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] 2127 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] 2128 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 2129 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] 2130 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2131 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 2132 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] 2133 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 2134 ; SSE2-NEXT: movdqu %xmm1, (%rax) 2135 ; SSE2-NEXT: popq %rbx 2136 ; SSE2-NEXT: popq %r12 2137 ; SSE2-NEXT: popq %r13 2138 ; SSE2-NEXT: popq %r14 2139 ; SSE2-NEXT: popq %r15 2140 ; SSE2-NEXT: popq %rbp 2141 ; SSE2-NEXT: retq 2142 ; 2143 ; AVX1-LABEL: not_avg_v16i8_wide_constants: 2144 ; AVX1: # %bb.0: 2145 ; AVX1-NEXT: pushq %rbp 2146 ; AVX1-NEXT: pushq %r15 2147 ; AVX1-NEXT: pushq %r14 2148 ; AVX1-NEXT: pushq %r13 2149 ; AVX1-NEXT: pushq %r12 2150 ; AVX1-NEXT: pushq %rbx 2151 ; AVX1-NEXT: subq $24, %rsp 2152 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2153 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2154 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2155 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2156 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 2157 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero 2158 ; AVX1-NEXT: vpextrq $1, %xmm5, %rbx 2159 ; AVX1-NEXT: vmovq %xmm5, %rbp 2160 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2161 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 2162 ; AVX1-NEXT: vpextrq $1, %xmm4, %rsi 2163 ; AVX1-NEXT: vmovq %xmm4, %rcx 2164 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2165 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 2166 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero 2167 ; AVX1-NEXT: vpextrq $1, %xmm4, %r8 2168 ; AVX1-NEXT: vmovq %xmm4, %r11 2169 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2170 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 2171 ; AVX1-NEXT: vpextrq $1, %xmm3, %r13 2172 ; AVX1-NEXT: vmovq %xmm3, %r12 2173 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 2174 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero 2175 ; AVX1-NEXT: vpextrq $1, %xmm4, %r15 2176 ; AVX1-NEXT: vmovq %xmm4, %rdi 2177 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2178 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero 2179 ; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 2180 ; 
; AVX1-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX1-NEXT: vmovq %xmm3, %r10
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpextrq $1, %xmm4, %rdx
; AVX1-NEXT: addq %rbx, %rdx
; AVX1-NEXT: vmovq %xmm4, %r9
; AVX1-NEXT: addq %rbp, %r9
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: addq %rsi, %rax
; AVX1-NEXT: movq %rax, %r14
; AVX1-NEXT: vmovq %xmm3, %rbp
; AVX1-NEXT: addq %rcx, %rbp
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpextrq $1, %xmm3, %rsi
; AVX1-NEXT: addq %r8, %rsi
; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: addq %r11, %rax
; AVX1-NEXT: movq %rax, %r11
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: addq %r13, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: addq %r12, %rax
; AVX1-NEXT: movq %rax, %r8
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: addq %r15, %rax
; AVX1-NEXT: movq %rax, %rbx
; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: addq %rdi, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vmovq %xmm2, %r12
; AVX1-NEXT: addq %r10, %r12
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpextrq $1, %xmm0, %r10
; AVX1-NEXT: addq %rax, %r10
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vmovq %xmm0, %rdi
; AVX1-NEXT: addq %rax, %rdi
; AVX1-NEXT: addq $-1, %rdx
; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %r9
; AVX1-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %r14
; AVX1-NEXT: movq %r14, (%rsp) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %rbp
; AVX1-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %rsi
; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %r11
; AVX1-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: addq $-1, %rcx
; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %ebp
; AVX1-NEXT: adcq $-1, %rbp
; AVX1-NEXT: addq $-1, %r8
; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %r15d
; AVX1-NEXT: adcq $-1, %r15
; AVX1-NEXT: addq $-1, %rbx
; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movl $0, %eax
; AVX1-NEXT: adcq $-1, %rax
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX1-NEXT: movl $0, %r13d
; AVX1-NEXT: adcq $-1, %r13
; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX1-NEXT: movl $0, %r14d
; AVX1-NEXT: adcq $-1, %r14
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX1-NEXT: addq $-1, %rdx
; AVX1-NEXT: movl $0, %r11d
; AVX1-NEXT: adcq $-1, %r11
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: addq $-1, %rax
; AVX1-NEXT: movl $0, %ebx
; AVX1-NEXT: adcq $-1, %rbx
; AVX1-NEXT: addq $-1, %r12
; AVX1-NEXT: movl $0, %r9d
; AVX1-NEXT: adcq $-1, %r9
; AVX1-NEXT: addq $-1, %r10
; AVX1-NEXT: movl $0, %r8d
; AVX1-NEXT: adcq $-1, %r8
; AVX1-NEXT: addq $-1, %rdi
; AVX1-NEXT: movl $0, %ecx
; AVX1-NEXT: adcq $-1, %rcx
; AVX1-NEXT: shldq $63, %rdi, %rcx
; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: shldq $63, %r10, %r8
; AVX1-NEXT: shldq $63, %r12, %r9
; AVX1-NEXT: shldq $63, %rax, %rbx
; AVX1-NEXT: shldq $63, %rdx, %r11
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX1-NEXT: shldq $63, %rdx, %r14
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX1-NEXT: shldq $63, %rdx, %r13
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %rsi
; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %r15
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %rbp
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %rsi
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %rcx
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %rdi
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
; AVX1-NEXT: movq (%rsp), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %r12
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rax, %r10
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX1-NEXT: shldq $63, %rdx, %rax
; AVX1-NEXT: vmovq %rax, %xmm8
; AVX1-NEXT: vmovq %r10, %xmm0
; AVX1-NEXT: vmovq %r12, %xmm1
; AVX1-NEXT: vmovq %rdi, %xmm11
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %rsi, %xmm13
; AVX1-NEXT: vmovq %rbp, %xmm14
; AVX1-NEXT: vmovq %r15, %xmm15
; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
; AVX1-NEXT: # xmm9 = mem[0],zero
; AVX1-NEXT: vmovq %r13, %xmm10
; AVX1-NEXT: vmovq %r14, %xmm12
; AVX1-NEXT: vmovq %r11, %xmm3
; AVX1-NEXT: vmovq %rbx, %xmm4
; AVX1-NEXT: vmovq %r9, %xmm5
; AVX1-NEXT: vmovq %r8, %xmm6
; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
; AVX1-NEXT: # xmm7 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm8, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: not_avg_v16i8_wide_constants:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $16, %rsp
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %rbx
; AVX2-NEXT: vmovq %xmm4, %rbp
; AVX2-NEXT: vpextrq $1, %xmm3, %rdi
; AVX2-NEXT: vmovq %xmm3, %rcx
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rdx
; AVX2-NEXT: vmovq %xmm3, %r9
; AVX2-NEXT: vpextrq $1, %xmm2, %r11
; AVX2-NEXT: vmovq %xmm2, %r12
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %r15
; AVX2-NEXT: vmovq %xmm3, %rsi
; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: addq %rbx, %rax
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: vmovq %xmm4, %r13
; AVX2-NEXT: addq %rbp, %r13
; AVX2-NEXT: vpextrq $1, %xmm3, %r10
; AVX2-NEXT: addq %rdi, %r10
; AVX2-NEXT: vmovq %xmm3, %r14
; AVX2-NEXT: addq %rcx, %r14
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: vmovq %xmm3, %r8
; AVX2-NEXT: addq %r9, %r8
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: addq %r11, %rax
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addq %r12, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: addq %r15, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: addq %rsi, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rbp
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; AVX2-NEXT: vmovq %xmm2, %r9
; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
; AVX2-NEXT: addq %rax, %rdi
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vmovq %xmm0, %rsi
; AVX2-NEXT: addq %rdx, %rsi
; AVX2-NEXT: addq $-1, %rbx
; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r13
; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r10
; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r14
; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %r13d
; AVX2-NEXT: adcq $-1, %r13
; AVX2-NEXT: addq $-1, %rcx
; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, %r8
; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %r15d
; AVX2-NEXT: adcq $-1, %r15
; AVX2-NEXT: addq $-1, %r11
; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movl $0, %ebx
; AVX2-NEXT: adcq $-1, %rbx
; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movl $0, %r8d
; AVX2-NEXT: adcq $-1, %r8
; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movl $0, %r12d
; AVX2-NEXT: adcq $-1, %r12
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: addq $-1, %rcx
; AVX2-NEXT: movl $0, %r11d
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: addq $-1, %rbp
; AVX2-NEXT: movl $0, %r14d
; AVX2-NEXT: adcq $-1, %r14
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: addq $-1, %rdi
; AVX2-NEXT: movl $0, %edx
; AVX2-NEXT: adcq $-1, %rdx
; AVX2-NEXT: addq $-1, %rsi
; AVX2-NEXT: movl $0, %eax
; AVX2-NEXT: adcq $-1, %rax
; AVX2-NEXT: shldq $63, %rsi, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: shldq $63, %rdi, %rdx
; AVX2-NEXT: shldq $63, %r9, %r10
; AVX2-NEXT: shldq $63, %rbp, %r14
; AVX2-NEXT: shldq $63, %rcx, %r11
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r12
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r9
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r8
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: shldq $63, %rax, %rbx
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: shldq $63, %rax, %r15
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %rax
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %r13
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %rbp
; AVX2-NEXT: movq (%rsp), %rdi # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %rdi
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT: shldq $63, %rcx, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm8
; AVX2-NEXT: vmovq %rdi, %xmm9
; AVX2-NEXT: vmovq %rbp, %xmm10
; AVX2-NEXT: vmovq %r13, %xmm11
; AVX2-NEXT: vmovq %rax, %xmm12
; AVX2-NEXT: vmovq %r15, %xmm13
; AVX2-NEXT: vmovq %rbx, %xmm14
; AVX2-NEXT: vmovq %r8, %xmm15
; AVX2-NEXT: vmovq %r9, %xmm0
; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[0],zero
; AVX2-NEXT: vmovq %r12, %xmm2
; AVX2-NEXT: vmovq %r11, %xmm3
; AVX2-NEXT: vmovq %r14, %xmm4
; AVX2-NEXT: vmovq %r10, %xmm5
; AVX2-NEXT: vmovq %rdx, %xmm6
; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
; AVX2-NEXT: # xmm7 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: addq $16, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: not_avg_v16i8_wide_constants:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, %rbx
; AVX512-NEXT: vmovq %xmm4, %rbp
; AVX512-NEXT: vpextrq $1, %xmm3, %rdi
; AVX512-NEXT: vmovq %xmm3, %rsi
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512-NEXT: vmovq %xmm3, %r8
; AVX512-NEXT: vpextrq $1, %xmm2, %r13
; AVX512-NEXT: vmovq %xmm2, %r12
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %r15
; AVX512-NEXT: vmovq %xmm3, %r14
; AVX512-NEXT: vpextrq $1, %xmm2, %r9
; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, %rax
; AVX512-NEXT: addq %rbx, %rax
; AVX512-NEXT: movq %rax, %rbx
; AVX512-NEXT: vmovq %xmm4, %rax
; AVX512-NEXT: addq %rbp, %rax
; AVX512-NEXT: movq %rax, %rbp
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: addq %rdi, %rax
; AVX512-NEXT: movq %rax, %rdi
; AVX512-NEXT: vmovq %xmm3, %r10
; AVX512-NEXT: addq %rsi, %r10
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
; AVX512-NEXT: addq %rdx, %rcx
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: addq %r8, %rax
; AVX512-NEXT: movq %rax, %r8
; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
; AVX512-NEXT: addq %r13, %rsi
; AVX512-NEXT: vmovq %xmm2, %r11
; AVX512-NEXT: addq %r12, %r11
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: addq %r15, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: addq %r14, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: addq %r9, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm2, %rax
; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm2, %r14
; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vpextrq $1, %xmm1, %r9
; AVX512-NEXT: addq %rax, %r9
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vmovq %xmm1, %rdx
; AVX512-NEXT: addq %rax, %rdx
; AVX512-NEXT: addq $-1, %rbx
; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rbp
; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rdi
; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill
; AVX512-NEXT: addq $-1, %r10
; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rcx
; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %r8
; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: addq $-1, %rsi
; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %r13d
; AVX512-NEXT: adcq $-1, %r13
; AVX512-NEXT: addq $-1, %r11
; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movl $0, %r15d
; AVX512-NEXT: adcq $-1, %r15
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %eax
; AVX512-NEXT: adcq $-1, %rax
; AVX512-NEXT: movq %rax, %rsi
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %r12d
; AVX512-NEXT: adcq $-1, %r12
; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: movl $0, %ebx
; AVX512-NEXT: adcq $-1, %rbx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; AVX512-NEXT: addq $-1, %rbp
; AVX512-NEXT: movl $0, %r11d
; AVX512-NEXT: adcq $-1, %r11
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: addq $-1, %rax
; AVX512-NEXT: movl $0, %r10d
; AVX512-NEXT: adcq $-1, %r10
; AVX512-NEXT: addq $-1, %r14
; AVX512-NEXT: movl $0, %r8d
; AVX512-NEXT: adcq $-1, %r8
; AVX512-NEXT: addq $-1, %r9
; AVX512-NEXT: movl $0, %edi
; AVX512-NEXT: adcq $-1, %rdi
; AVX512-NEXT: addq $-1, %rdx
; AVX512-NEXT: movl $0, %ecx
; AVX512-NEXT: adcq $-1, %rcx
; AVX512-NEXT: shldq $63, %rdx, %rcx
; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: shldq $63, %r9, %rdi
; AVX512-NEXT: shldq $63, %r14, %r8
; AVX512-NEXT: shldq $63, %rax, %r10
; AVX512-NEXT: shldq $63, %rbp, %r11
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %rbx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %r12
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %rsi
; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %r15
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %r13
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %rsi
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: shldq $63, %rax, %rcx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %rax
; AVX512-NEXT: movq (%rsp), %r14 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %r14
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %r9
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
; AVX512-NEXT: shldq $63, %rdx, %rbp
; AVX512-NEXT: vmovq %rbp, %xmm8
; AVX512-NEXT: vmovq %r9, %xmm9
; AVX512-NEXT: vmovq %r14, %xmm10
; AVX512-NEXT: vmovq %rax, %xmm11
; AVX512-NEXT: vmovq %rcx, %xmm12
; AVX512-NEXT: vmovq %rsi, %xmm13
; AVX512-NEXT: vmovq %r13, %xmm14
; AVX512-NEXT: vmovq %r15, %xmm15
; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[0],zero
; AVX512-NEXT: vmovq %r12, %xmm1
; AVX512-NEXT: vmovq %rbx, %xmm2
; AVX512-NEXT: vmovq %r11, %xmm3
; AVX512-NEXT: vmovq %r10, %xmm4
; AVX512-NEXT: vmovq %r8, %xmm5
; AVX512-NEXT: vmovq %rdi, %xmm6
; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
; AVX512-NEXT: # xmm7 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
; AVX512-NEXT: vpmovqd %zmm8, %ymm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm7[0],xmm6[0]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: addq $24, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i128>
%4 = zext <16 x i8> %2 to <16 x i128>
%5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
%6 = add nuw nsw <16 x i128> %5, %4
%7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
%8 = trunc <16 x i128> %7 to <16 x i8>
store <16 x i8> %8, <16 x i8>* undef, align 4
ret void
}
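
; Note: @not_avg_v16i8_wide_constants above is a negative test. The operands are
; zero-extended to i128 and the constant added before the shift is -1 rather than
; +1, so the computation is (x + y - 1) >> 1, not the (x + y + 1) >> 1 averaging
; idiom; no vpavgb can be formed and each target falls back to the scalarized
; 64-bit sequence checked above.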