; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST

; AVX2 Logical Shift Left

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsllw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

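; A scalar, non-constant shift amount is expected to stay scalar: it is moved
; into an XMM register and used with the register form of vpslld rather than
; being splatted into a per-element shift.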
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32:         # %bb.0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:         # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpslld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpslld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsllq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsllq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift

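; Arithmetic shifts follow the same pattern as the logical shifts above: an
; ashr by 0 should fold away completely, while other uniform constant amounts
; should select the immediate vpsraw/vpsrad forms. (AVX2 has no 64-bit
; arithmetic shift, so there is no sraq counterpart here.)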
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsraw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsraw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrad $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrad $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right

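; Logical right shifts mirror the left-shift cases: a shift by 0 folds away,
; and uniform constant amounts should select the immediate
; vpsrlw/vpsrld/vpsrlq forms.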
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrld $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrld $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32:         # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_1:
; X64:         # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_2:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32:         # %bb.0: # %entry
; X32-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_3:
; X64:         # %bb.0: # %entry
; X64-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

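; The truncation of the v4i64 shift amount should be done with a cross-lane
; shuffle (vpshufd+vpermq, or a single vpermd with +fast-variable-shuffle),
; after which the 'and' with 8 and the variable shift itself are expected to
; narrow to 128-bit vpand/vpsrlvd operations.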
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW:         # %bb.0:
; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT:    vzeroupper
; X32-SLOW-NEXT:    retl
;
; X32-FAST-LABEL: srl_trunc_and_v4i64:
; X32-FAST:         # %bb.0:
; X32-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X32-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-FAST-NEXT:    vzeroupper
; X32-FAST-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:         # %bb.0:
; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-LABEL: srl_trunc_and_v4i64:
; X64-FAST:         # %bb.0:
; X64-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-NEXT:    vzeroupper
; X64-FAST-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

;
; Vectorized variable shifts
;

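; AVX2 has no variable per-element 16-bit shift, so <8 x i16> operands and
; amounts are zero-extended to 32-bit lanes, shifted with vpsllvd, and the low
; words are repacked into an XMM result.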
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32:         # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: shl_8i16:
; X64:         # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32:         # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_16i16:
; X64:         # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

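; There is no byte-granularity shift instruction at all: the amount is moved
; into the byte sign bits with vpsllw $5 and a vpblendvb ladder conditionally
; applies shifts of 4, 2 and 1 (the final step as a vpaddb).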
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32:         # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsllw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsllw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:         # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

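; The arithmetic variants use the same widening scheme: words are
; sign-extended and shifted with vpsravd, while bytes go through a vpblendvb
; ladder built from vpsraw steps on the unpacked halves.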
define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32:         # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X32-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: ashr_8i16:
; X64:         # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32:         # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_16i16:
; X64:         # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32:         # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpsraw $4, %ymm3, %ymm4
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $2, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $1, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpsraw $4, %ymm0, %ymm3
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $2, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $1, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_32i8:
; X64:         # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpsraw $4, %ymm3, %ymm4
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $2, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $1, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpsraw $4, %ymm0, %ymm3
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $2, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $1, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

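; Variable logical right shifts mirror the left-shift lowering: zero-extend
; plus vpsrlvd for 16-bit elements, and a vpsrlw/vpblendvb ladder for bytes.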
define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32:         # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: lshr_8i16:
; X64:         # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32:         # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_16i16:
; X64:         # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32:         # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:         # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}