; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; TODO: Add AVX512BW shift support
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, %b
  ret <16 x i32> %shift
}
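; Note: v32i16 and v64i8 shifts are not legal 512-bit operations without
; AVX512BW (see the TODO above), so the checks below show them split into two
; 256-bit halves: words are zero-extended into dword lanes (vpunpcklwd /
; vpunpckhwd with zero), shifted with vpsllvd and repacked via vpsrld $16 +
; vpackusdw, while bytes use the vpsllw $5 + vpblendvb ladder that applies
; shifts of 4, 2 and 1 as each amount bit reaches the byte's sign bit.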
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; ALL-LABEL: var_shift_v32i16:
; ALL:       ## BB#0:
; ALL-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; ALL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; ALL-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; ALL-NEXT:    vpsllvd %ymm5, %ymm6, %ymm5
; ALL-NEXT:    vpsrld $16, %ymm5, %ymm5
; ALL-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
; ALL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; ALL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpsrld $16, %ymm0, %ymm0
; ALL-NEXT:    vpackusdw %ymm5, %ymm0, %ymm0
; ALL-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
; ALL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; ALL-NEXT:    vpsllvd %ymm2, %ymm5, %ymm2
; ALL-NEXT:    vpsrld $16, %ymm2, %ymm2
; ALL-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
; ALL-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; ALL-NEXT:    vpsllvd %ymm3, %ymm1, %ymm1
; ALL-NEXT:    vpsrld $16, %ymm1, %ymm1
; ALL-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; ALL-LABEL: var_shift_v64i8:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllw $4, %ymm0, %ymm4
; ALL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; ALL-NEXT:    vpand %ymm5, %ymm4, %ymm4
; ALL-NEXT:    vpsllw $5, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $2, %ymm0, %ymm4
; ALL-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; ALL-NEXT:    vpand %ymm6, %ymm4, %ymm4
; ALL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; ALL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
; ALL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $4, %ymm1, %ymm2
; ALL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; ALL-NEXT:    vpsllw $5, %ymm3, %ymm3
; ALL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    vpsllw $2, %ymm1, %ymm2
; ALL-NEXT:    vpand %ymm6, %ymm2, %ymm2
; ALL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; ALL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
; ALL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
; ALL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = shl <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT:    vmovss %xmm1, %xmm2, %xmm1
; ALL-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = shl <16 x i32> %a, %splat
  ret <16 x i32> %shift
}
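; Note: for the splat-variable word/byte cases below only a single scalar shift
; amount is needed: the word case moves it through %eax (movzwl) into an xmm
; register and reuses vpsllw on each 256-bit half, while the byte case
; broadcasts it with vpbroadcastb and falls back to the same shift-and-blend
; ladder as var_shift_v64i8 above.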
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; ALL-LABEL: splatvar_shift_v32i16:
; ALL:       ## BB#0:
; ALL-NEXT:    vmovd %xmm2, %eax
; ALL-NEXT:    movzwl %ax, %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
; ALL-NEXT:    vpsllw %xmm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = shl <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; ALL-LABEL: splatvar_shift_v64i8:
; ALL:       ## BB#0:
; ALL-NEXT:    vpbroadcastb %xmm2, %ymm2
; ALL-NEXT:    vpsllw $4, %ymm0, %ymm3
; ALL-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; ALL-NEXT:    vpand %ymm4, %ymm3, %ymm3
; ALL-NEXT:    vpsllw $5, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $2, %ymm0, %ymm3
; ALL-NEXT:    vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; ALL-NEXT:    vpand %ymm5, %ymm3, %ymm3
; ALL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
; ALL-NEXT:    vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
; ALL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; ALL-NEXT:    vpaddb %ymm6, %ymm6, %ymm7
; ALL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $4, %ymm1, %ymm3
; ALL-NEXT:    vpand %ymm4, %ymm3, %ymm3
; ALL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; ALL-NEXT:    vpsllw $2, %ymm1, %ymm2
; ALL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
; ALL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = shl <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}
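; Note: the constant word shift below is lowered per 256-bit half as a vpmullw
; by the matching power-of-two multipliers, and the constant byte shift reuses
; the blend ladder with the shift amounts materialized as a constant vector.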
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; ALL-LABEL: constant_shift_v32i16:
; ALL:       ## BB#0:
; ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; ALL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllw $4, %ymm0, %ymm2
; ALL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; ALL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; ALL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; ALL-NEXT:    vpsllw $5, %ymm4, %ymm4
; ALL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $2, %ymm0, %ymm2
; ALL-NEXT:    vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; ALL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; ALL-NEXT:    vpaddb %ymm4, %ymm4, %ymm6
; ALL-NEXT:    vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; ALL-NEXT:    vpaddb %ymm6, %ymm6, %ymm7
; ALL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $4, %ymm1, %ymm2
; ALL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    vpsllw $2, %ymm1, %ymm2
; ALL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; ALL-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    vpaddb %ymm1, %ymm1, %ymm2
; ALL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       ## BB#0:
; ALL-NEXT:    vpslld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v32i16:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllw $3, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $3, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v64i8:
; ALL:       ## BB#0:
; ALL-NEXT:    vpsllw $3, %ymm0, %ymm0
; ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; ALL-NEXT:    vpand %ymm2, %ymm0, %ymm0
; ALL-NEXT:    vpsllw $3, %ymm1, %ymm1
; ALL-NEXT:    vpand %ymm2, %ymm1, %ymm1
; ALL-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}
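; Note: in splatconstant_shift_v64i8 above, each 256-bit half is shifted as
; words (vpsllw $3) and then masked with 248 (0xF8) to clear the low bits that
; received data from the neighbouring byte.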