; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
;
; Tests lowering of unsigned div/rem by the constant 7 on 256-bit vectors.
; 7 is not a power of two, so each element width is lowered to a
; multiply-by-magic-constant + shift sequence rather than a real divide.

;
; udiv by 7
;

; i64: there is no packed 64-bit multiply, so each lane is extracted and
; divided with scalar mulq against the magic constant 0x2492492492492493;
; the quotient is ((n - hi) >> 1 + hi) >> 2 where hi = mulhi(n, magic).
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    subq %rdx, %rcx
; AVX1-NEXT:    shrq %rcx
; AVX1-NEXT:    addq %rdx, %rcx
; AVX1-NEXT:    shrq $2, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    subq %rdx, %rcx
; AVX2-NEXT:    shrq %rcx
; AVX2-NEXT:    addq %rdx, %rcx
; AVX2-NEXT:    shrq $2, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

; i32: the high half of n*613566757 is built with vpmuludq on the even and
; odd lanes and blended back together, then the same (n - hi) >> 1 + hi) >> 2
; fixup as the i64 case. AVX1 splits the ymm into two 128-bit halves.
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

; i16: vpmulhuw yields the high half of n*9363 directly, so only the
; sub/shift/add/shift fixup is needed per half (or once on ymm with AVX2).
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

; i8: there is no byte multiply or byte shift, so lanes are zero-extended to
; i16, multiplied by 37, and narrowed with vpackuswb; the byte shifts are
; emulated with word shifts plus 0x7f/0x3f masks. AVX512BW instead widens the
; whole vector to zmm words and truncates back with vpmovwb.
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm5, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_32i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $2, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}

;
; urem by 7
;

; i64: same scalar quotient sequence as test_div7_4i64, then the remainder is
; n - 7q, computed as q - 8q (via leaq (,%rax,8)) added back to n.
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rsi
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $2, %rax
; AVX1-NEXT:    leaq (,%rax,8), %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rsi
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $2, %rax
; AVX2-NEXT:    leaq (,%rax,8), %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %res
}

; i32: quotient as in test_div7_8i32, then remainder n - q*7 via vpmulld.
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm3
; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrld $2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
; AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %res
}

; i16: quotient as in test_div7_16i16, then remainder n - q*7 via vpmullw.
define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <16 x i16> %res
}

; i8: quotient as in test_div7_32i8, then q*7 is computed by widening the
; byte quotient to words, multiplying by 7, and narrowing back before the
; final vpsubb.
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm8, %xmm4, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_32i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2NOBW-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2NOBW-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2NOBW-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT:    vpsrlw $1, %ymm2, %ymm2
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT:    vpsrlw $2, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
  ret <32 x i8> %res
}