; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW

;
; Variable Rotates
;

define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <8 x i64> %a, %b
  %lshr = lshr <8 x i64> %a, %b64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <16 x i32> %a, %b
  %lshr = lshr <16 x i32> %a, %b32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm4
; AVX512F-NEXT:    vpmovdw %zmm4, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpsllvd %zmm2, %zmm1, %zmm2
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm4
; AVX512VL-NEXT:    vpmovdw %zmm4, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %ymm2, %ymm5, %ymm2
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm1, %zmm2
; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm1, %zmm1
; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT:    retq
  %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <32 x i16> %a, %b
  %lshr = lshr <32 x i16> %a, %b16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm6
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
; AVX512F-NEXT:    vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm8
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT:    vpand %ymm9, %ymm8, %ymm8
; AVX512F-NEXT:    vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm8, %ymm4, %ymm4
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
; AVX512F-NEXT:    vpor %ymm4, %ymm10, %ymm4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm4
; AVX512F-NEXT:    vpand %ymm9, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm8, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm6
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm8
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT:    vpand %ymm9, %ymm8, %ymm8
; AVX512VL-NEXT:    vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT:    vpand %ymm8, %ymm4, %ymm4
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
; AVX512VL-NEXT:    vpor %ymm4, %ymm10, %ymm4
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm4
; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm4
; AVX512VL-NEXT:    vpand %ymm9, %ymm4, %ymm4
; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT:    vpand %ymm8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT:    vpsllw $2, %zmm3, %zmm4
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpsllw $2, %zmm3, %zmm4
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm1
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT:    vpsrlw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vpsrlw $1, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT:    retq
  %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <64 x i8> %a, %b
  %lshr = lshr <64 x i8> %a, %b8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Uniform Variable Rotates
;

define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <8 x i64> %a, %splat
  %lshr = lshr <8 x i64> %a, %splat64
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_rotate_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <16 x i32> %a, %splat
  %lshr = lshr <16 x i32> %a, %splat32
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm3
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpsllw %xmm2, %ymm1, %ymm2
; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm3
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT:    vpsllw %xmm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %xmm1, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %zmm2
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %zmm2, %zmm3, %zmm2
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <32 x i16> %a, %splat
  %lshr = lshr <32 x i16> %a, %splat16
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT:    vpor %ymm3, %ymm5, %ymm3
; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm7
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
; AVX512F-NEXT:    vpor %ymm3, %ymm7, %ymm3
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm7
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm9, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
; AVX512F-NEXT:    vpor %ymm3, %ymm10, %ymm3
; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
; AVX512F-NEXT:    vpblendvb %ymm10, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm3
; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm4
; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm7
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT:    vpand %ymm8, %ymm7, %ymm7
; AVX512VL-NEXT:    vpor %ymm3, %ymm7, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm7
; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT:    vpand %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
; AVX512VL-NEXT:    vpor %ymm3, %ymm10, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm3
; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm4
; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $6, %ymm1, %ymm2
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT:    vpand %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT:    vpand %ymm9, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT:    vpsllw $2, %zmm3, %zmm4
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpsllw $2, %zmm3, %zmm4
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm1
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT:    vpsrlw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vpsrlw $1, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <64 x i8> %a, %splat
  %lshr = lshr <64 x i8> %a, %splat8
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Constant Rotates
;

define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: constant_rotate_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
  %or = or <8 x i64> %shl, %lshr
  ret <8 x i64> %or
}

define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: constant_rotate_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <16 x i32> %shl, %lshr
  ret <16 x i32> %or
}

define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512VL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <32 x i16> %shl, %lshr
  ret <32 x i16> %or
}

define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm7
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT:    vpand %ymm8, %ymm7, %ymm7
; AVX512F-NEXT:    vpor %ymm2, %ymm7, %ymm2
; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
; AVX512F-NEXT:    vpor %ymm2, %ymm10, %ymm2
; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm10
; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm3
; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $6, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT:    vpand %ymm9, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm10, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 =
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 669 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 670 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 671 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] 672 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 673 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm2 674 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] 675 ; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 676 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm7 677 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 678 ; AVX512VL-NEXT: vpand %ymm8, %ymm7, %ymm7 679 ; AVX512VL-NEXT: vpor %ymm2, %ymm7, %ymm2 680 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 681 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 682 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 683 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 684 ; AVX512VL-NEXT: vpand %ymm9, %ymm2, %ymm2 685 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm10 686 ; AVX512VL-NEXT: vpor %ymm2, %ymm10, %ymm2 687 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm10 688 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 689 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 690 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 691 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3 692 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 693 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 694 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 695 ; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2 696 ; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 697 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3 698 ; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 699 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 700 ; AVX512VL-NEXT: vpblendvb 
%ymm7, %ymm2, %ymm1, %ymm1 701 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 702 ; AVX512VL-NEXT: vpand %ymm9, %ymm2, %ymm2 703 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3 704 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 705 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm1 706 ; AVX512VL-NEXT: retq 707 ; 708 ; AVX512BW-LABEL: constant_rotate_v64i8: 709 ; AVX512BW: # %bb.0: 710 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] 711 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 712 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 713 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 714 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} 715 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3 716 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 717 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 718 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 719 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} 720 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 721 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 722 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} 723 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] 724 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 725 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 726 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 727 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} 728 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3 729 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 730 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 731 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 732 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} 733 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3 734 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 735 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 736 
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 737 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} 738 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 739 ; AVX512BW-NEXT: retq 740 ; 741 ; AVX512VLBW-LABEL: constant_rotate_v64i8: 742 ; AVX512VLBW: # %bb.0: 743 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] 744 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 745 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 746 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 747 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} 748 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3 749 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 750 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 751 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 752 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} 753 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 754 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 755 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} 756 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] 757 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 758 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 759 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 760 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} 761 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3 762 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 763 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 764 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 765 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} 766 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3 767 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 768 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 769 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 770 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 
{%k1} 771 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 772 ; AVX512VLBW-NEXT: retq 773 %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 774 %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 775 %or = or <64 x i8> %shl, %lshr 776 ret <64 x i8> %or 777 } 778 779 ; 780 ; Uniform Constant Rotates 781 ; 782 783 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind { 784 ; AVX512-LABEL: splatconstant_rotate_v8i64: 785 ; AVX512: # %bb.0: 786 ; AVX512-NEXT: vprolq $14, %zmm0, %zmm0 787 ; AVX512-NEXT: retq 788 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14> 789 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50> 790 %or = or <8 x i64> %shl, %lshr 791 ret <8 x i64> %or 792 } 793 794 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind { 795 ; AVX512-LABEL: splatconstant_rotate_v16i32: 796 ; AVX512: # %bb.0: 797 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0 798 ; AVX512-NEXT: retq 799 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> 800 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 
28, i32 28, i32 28> 801 %or = or <16 x i32> %shl, %lshr 802 ret <16 x i32> %or 803 } 804 805 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { 806 ; AVX512F-LABEL: splatconstant_rotate_v32i16: 807 ; AVX512F: # %bb.0: 808 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 809 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 810 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 811 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2 812 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1 813 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 814 ; AVX512F-NEXT: retq 815 ; 816 ; AVX512VL-LABEL: splatconstant_rotate_v32i16: 817 ; AVX512VL: # %bb.0: 818 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 819 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 820 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 821 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2 822 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 823 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 824 ; AVX512VL-NEXT: retq 825 ; 826 ; AVX512BW-LABEL: splatconstant_rotate_v32i16: 827 ; AVX512BW: # %bb.0: 828 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 829 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0 830 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 831 ; AVX512BW-NEXT: retq 832 ; 833 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16: 834 ; AVX512VLBW: # %bb.0: 835 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1 836 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0 837 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 838 ; AVX512VLBW-NEXT: retq 839 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 840 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 841 %or = or <32 x i16> %shl, %lshr 842 ret <32 x i16> %or 843 } 844 845 
define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <64 x i8> %shl, %lshr
  ret <64 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
  %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
  %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
  %or = or <8 x i64> %lmask, %rmask
  ret <8 x i64> %or
}

define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <16 x i32> %lmask, %rmask
  ret <16 x i32> %or
}

define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <32 x i16> %lmask, %rmask
  ret <32 x i16> %or
}

define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <64 x i8> %lmask, %rmask
  ret <64 x i8> %or
}