; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s

; Vector `shl` tests for x86_64-apple-darwin with -mcpu=core2.
;
; Every function below is checked twice, once per invocation above:
;   * the llc invocation (codegen prefix) checks for an instruction pattern
;     in the generated assembly;
;   * the opt -cost-model -analyze invocation (cost prefix) checks the cost
;     reported for the `shl` instruction.
;
; The first half shifts by a variable vector (%b); the second half shifts by
; a constant vector whose elements are all 3 (there %b is declared but not
; used).
;
; Function names are matched with the -LABEL form of each prefix so that the
; checks following a name are confined to that function's output and cannot
; be satisfied by the output of a later function.

;------------------------------------------------------------------------------
; Shift by a variable vector: i16 elements.
;------------------------------------------------------------------------------

%shifttype = type <2 x i16>
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
  ; SSE2-LABEL: shift2i16
  ; SSE2: cost of 20 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i16
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype %a, %b
  ret %shifttype %0
}

%shifttype4i16 = type <4 x i16>
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
  ; SSE2-LABEL: shift4i16
  ; SSE2: cost of 10 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i16
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype4i16 %a, %b
  ret %shifttype4i16 %0
}

%shifttype8i16 = type <8 x i16>
define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
entry:
  ; SSE2-LABEL: shift8i16
  ; SSE2: cost of 80 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i16
  ; SSE2-CODEGEN: shll %cl

  %0 = shl %shifttype8i16 %a, %b
  ret %shifttype8i16 %0
}

%shifttype16i16 = type <16 x i16>
define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
entry:
  ; SSE2-LABEL: shift16i16
  ; SSE2: cost of 160 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i16
  ; SSE2-CODEGEN: shll %cl

  %0 = shl %shifttype16i16 %a, %b
  ret %shifttype16i16 %0
}

%shifttype32i16 = type <32 x i16>
define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
entry:
  ; SSE2-LABEL: shift32i16
  ; SSE2: cost of 320 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i16
  ; SSE2-CODEGEN: shll %cl

  %0 = shl %shifttype32i16 %a, %b
  ret %shifttype32i16 %0
}

;------------------------------------------------------------------------------
; Shift by a variable vector: i32 elements.
;------------------------------------------------------------------------------

%shifttype2i32 = type <2 x i32>
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
  ; SSE2-LABEL: shift2i32
  ; SSE2: cost of 20 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i32
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype2i32 %a, %b
  ret %shifttype2i32 %0
}

%shifttype4i32 = type <4 x i32>
define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
entry:
  ; SSE2-LABEL: shift4i32
  ; SSE2: cost of 10 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i32
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype4i32 %a, %b
  ret %shifttype4i32 %0
}

%shifttype8i32 = type <8 x i32>
define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
entry:
  ; SSE2-LABEL: shift8i32
  ; SSE2: cost of 20 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i32
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype8i32 %a, %b
  ret %shifttype8i32 %0
}

%shifttype16i32 = type <16 x i32>
define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
entry:
  ; SSE2-LABEL: shift16i32
  ; SSE2: cost of 40 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i32
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype16i32 %a, %b
  ret %shifttype16i32 %0
}

%shifttype32i32 = type <32 x i32>
define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
entry:
  ; SSE2-LABEL: shift32i32
  ; SSE2: cost of 80 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i32
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype32i32 %a, %b
  ret %shifttype32i32 %0
}

;------------------------------------------------------------------------------
; Shift by a variable vector: i64 elements.
;------------------------------------------------------------------------------

%shifttype2i64 = type <2 x i64>
define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
entry:
  ; SSE2-LABEL: shift2i64
  ; SSE2: cost of 20 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i64
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype2i64 %a, %b
  ret %shifttype2i64 %0
}

%shifttype4i64 = type <4 x i64>
define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
entry:
  ; SSE2-LABEL: shift4i64
  ; SSE2: cost of 40 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i64
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype4i64 %a, %b
  ret %shifttype4i64 %0
}

%shifttype8i64 = type <8 x i64>
define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
entry:
  ; SSE2-LABEL: shift8i64
  ; SSE2: cost of 80 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i64
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype8i64 %a, %b
  ret %shifttype8i64 %0
}

%shifttype16i64 = type <16 x i64>
define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
entry:
  ; SSE2-LABEL: shift16i64
  ; SSE2: cost of 160 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i64
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype16i64 %a, %b
  ret %shifttype16i64 %0
}

%shifttype32i64 = type <32 x i64>
define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
entry:
  ; SSE2-LABEL: shift32i64
  ; SSE2: cost of 320 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i64
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype32i64 %a, %b
  ret %shifttype32i64 %0
}

;------------------------------------------------------------------------------
; Shift by a variable vector: i8 elements.
;------------------------------------------------------------------------------

%shifttype2i8 = type <2 x i8>
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
  ; SSE2-LABEL: shift2i8
  ; SSE2: cost of 20 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i8
  ; SSE2-CODEGEN: shlq %cl

  %0 = shl %shifttype2i8 %a, %b
  ret %shifttype2i8 %0
}

%shifttype4i8 = type <4 x i8>
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
  ; SSE2-LABEL: shift4i8
  ; SSE2: cost of 10 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i8
  ; SSE2-CODEGEN: pmuludq

  %0 = shl %shifttype4i8 %a, %b
  ret %shifttype4i8 %0
}

%shifttype8i8 = type <8 x i8>
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
  ; SSE2-LABEL: shift8i8
  ; SSE2: cost of 80 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i8
  ; SSE2-CODEGEN: shll

  %0 = shl %shifttype8i8 %a, %b
  ret %shifttype8i8 %0
}

%shifttype16i8 = type <16 x i8>
define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
entry:
  ; SSE2-LABEL: shift16i8
  ; SSE2: cost of 30 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i8
  ; SSE2-CODEGEN: cmpeqb

  %0 = shl %shifttype16i8 %a, %b
  ret %shifttype16i8 %0
}

%shifttype32i8 = type <32 x i8>
define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
entry:
  ; SSE2-LABEL: shift32i8
  ; SSE2: cost of 60 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i8
  ; SSE2-CODEGEN: cmpeqb

  %0 = shl %shifttype32i8 %a, %b
  ret %shifttype32i8 %0
}

;------------------------------------------------------------------------------
; Test shift by a constant vector.
; Every shift amount below is the constant 3 in all lanes.
;------------------------------------------------------------------------------

; i16 elements.

%shifttypec = type <2 x i16>
define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
entry:
  ; SSE2-LABEL: shift2i16const
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i16const
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec %a, <i16 3, i16 3>
  ret %shifttypec %0
}

%shifttypec4i16 = type <4 x i16>
define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
entry:
  ; SSE2-LABEL: shift4i16const
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i16const
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec4i16 %a, <i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec4i16 %0
}

%shifttypec8i16 = type <8 x i16>
define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
entry:
  ; SSE2-LABEL: shift8i16const
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i16const
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec8i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec8i16 %0
}

%shifttypec16i16 = type <16 x i16>
define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                         %shifttypec16i16 %b) {
entry:
  ; SSE2-LABEL: shift16i16const
  ; SSE2: cost of 2 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i16const
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec16i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec16i16 %0
}

%shifttypec32i16 = type <32 x i16>
define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
entry:
  ; SSE2-LABEL: shift32i16const
  ; SSE2: cost of 4 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i16const
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec32i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec32i16 %0
}

; i32 elements.

%shifttypec2i32 = type <2 x i32>
define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
entry:
  ; SSE2-LABEL: shift2i32c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i32c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec2i32 %a, <i32 3, i32 3>
  ret %shifttypec2i32 %0
}

%shifttypec4i32 = type <4 x i32>
define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
entry:
  ; SSE2-LABEL: shift4i32c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i32c
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec4i32 %a, <i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec4i32 %0
}

%shifttypec8i32 = type <8 x i32>
define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
entry:
  ; SSE2-LABEL: shift8i32c
  ; SSE2: cost of 2 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i32c
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec8i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec8i32 %0
}

%shifttypec16i32 = type <16 x i32>
define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
entry:
  ; SSE2-LABEL: shift16i32c
  ; SSE2: cost of 4 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i32c
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec16i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec16i32 %0
}

%shifttypec32i32 = type <32 x i32>
define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
entry:
  ; SSE2-LABEL: shift32i32c
  ; SSE2: cost of 8 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i32c
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec32i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec32i32 %0
}

; i64 elements.

%shifttypec2i64 = type <2 x i64>
define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
entry:
  ; SSE2-LABEL: shift2i64c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i64c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec2i64 %a, <i64 3, i64 3>
  ret %shifttypec2i64 %0
}

%shifttypec4i64 = type <4 x i64>
define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
entry:
  ; SSE2-LABEL: shift4i64c
  ; SSE2: cost of 2 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i64c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec4i64 %a, <i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec4i64 %0
}

%shifttypec8i64 = type <8 x i64>
define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
entry:
  ; SSE2-LABEL: shift8i64c
  ; SSE2: cost of 4 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i64c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec8i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec8i64 %0
}

%shifttypec16i64 = type <16 x i64>
define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
entry:
  ; SSE2-LABEL: shift16i64c
  ; SSE2: cost of 8 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i64c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec16i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec16i64 %0
}

%shifttypec32i64 = type <32 x i64>
define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
entry:
  ; SSE2-LABEL: shift32i64c
  ; SSE2: cost of 16 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i64c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec32i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec32i64 %0
}

; i8 elements.

%shifttypec2i8 = type <2 x i8>
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
  ; SSE2-LABEL: shift2i8c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift2i8c
  ; SSE2-CODEGEN: psllq $3

  %0 = shl %shifttypec2i8 %a, <i8 3, i8 3>
  ret %shifttypec2i8 %0
}

%shifttypec4i8 = type <4 x i8>
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
  ; SSE2-LABEL: shift4i8c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift4i8c
  ; SSE2-CODEGEN: pslld $3

  %0 = shl %shifttypec4i8 %a, <i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec4i8 %0
}

%shifttypec8i8 = type <8 x i8>
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
  ; SSE2-LABEL: shift8i8c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift8i8c
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec8i8 %a, <i8 3, i8 3, i8 3, i8 3,
                               i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec8i8 %0
}

%shifttypec16i8 = type <16 x i8>
define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
entry:
  ; SSE2-LABEL: shift16i8c
  ; SSE2: cost of 1 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift16i8c
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec16i8 %a, <i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec16i8 %0
}

%shifttypec32i8 = type <32 x i8>
define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
entry:
  ; SSE2-LABEL: shift32i8c
  ; SSE2: cost of 2 {{.*}} shl
  ; SSE2-CODEGEN-LABEL: shift32i8c
  ; SSE2-CODEGEN: psllw $3

  %0 = shl %shifttypec32i8 %a, <i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec32i8 %0
}