; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL

define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}

define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}

define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movq %rdi, %rcx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    movl %edi, %edx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edx, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}

define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    movq %rdi, %rcx
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    movl %edi, %edx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %ecx, %k2
; AVX512F-NEXT:    kmovw %eax, %k3
; AVX512F-NEXT:    kmovw %edx, %k4
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}

define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}

define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}

define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    shrl $16, %edi
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}