; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s

; Cost-model and code generation expectations for vector 'lshr' when targeting
; core2 on x86_64 Darwin.
;
; Every function below is verified twice:
;   * the 'opt -cost-model -analyze' invocation checks the estimated cost that
;     is reported for the lshr instruction, and
;   * the 'llc' invocation checks that the generated assembly contains the
;     expected shift instruction.
;
; Function names are matched with -LABEL directives so that each cost or
; instruction expectation can only be satisfied by output that belongs to the
; function it is written in, never by output of a later function.

;------------------------------------------------------------------------------
; Shift by a variable vector amount (%b).
;
; The expected cost in this group is 10 per vector element (20 for 2 elements
; up to 320 for 32 elements), and the expected assembly contains a scalar
; shr-by-%cl instruction.
;------------------------------------------------------------------------------

; --- i16 elements ---

%shifttype = type <2 x i16>
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
  ; SSE2-LABEL: shift2i16
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i16
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype %a , %b
  ret %shifttype %0
}

%shifttype4i16 = type <4 x i16>
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
  ; SSE2-LABEL: shift4i16
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i16
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i16 %a , %b
  ret %shifttype4i16 %0
}

%shifttype8i16 = type <8 x i16>
define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
entry:
  ; SSE2-LABEL: shift8i16
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i16
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype8i16 %a , %b
  ret %shifttype8i16 %0
}

%shifttype16i16 = type <16 x i16>
define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
entry:
  ; SSE2-LABEL: shift16i16
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i16
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype16i16 %a , %b
  ret %shifttype16i16 %0
}

%shifttype32i16 = type <32 x i16>
define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
entry:
  ; SSE2-LABEL: shift32i16
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i16
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype32i16 %a , %b
  ret %shifttype32i16 %0
}

; --- i32 elements ---

%shifttype2i32 = type <2 x i32>
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
  ; SSE2-LABEL: shift2i32
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i32
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype2i32 %a , %b
  ret %shifttype2i32 %0
}

%shifttype4i32 = type <4 x i32>
define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
entry:
  ; SSE2-LABEL: shift4i32
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i32 %a , %b
  ret %shifttype4i32 %0
}

%shifttype8i32 = type <8 x i32>
define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
entry:
  ; SSE2-LABEL: shift8i32
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype8i32 %a , %b
  ret %shifttype8i32 %0
}

%shifttype16i32 = type <16 x i32>
define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
entry:
  ; SSE2-LABEL: shift16i32
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype16i32 %a , %b
  ret %shifttype16i32 %0
}

%shifttype32i32 = type <32 x i32>
define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
entry:
  ; SSE2-LABEL: shift32i32
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype32i32 %a , %b
  ret %shifttype32i32 %0
}

; --- i64 elements ---

%shifttype2i64 = type <2 x i64>
define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
entry:
  ; SSE2-LABEL: shift2i64
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i64
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype2i64 %a , %b
  ret %shifttype2i64 %0
}

%shifttype4i64 = type <4 x i64>
define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
entry:
  ; SSE2-LABEL: shift4i64
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i64
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype4i64 %a , %b
  ret %shifttype4i64 %0
}

%shifttype8i64 = type <8 x i64>
define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
entry:
  ; SSE2-LABEL: shift8i64
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i64
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype8i64 %a , %b
  ret %shifttype8i64 %0
}

%shifttype16i64 = type <16 x i64>
define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
entry:
  ; SSE2-LABEL: shift16i64
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i64
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype16i64 %a , %b
  ret %shifttype16i64 %0
}

%shifttype32i64 = type <32 x i64>
define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
entry:
  ; SSE2-LABEL: shift32i64
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i64
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype32i64 %a , %b
  ret %shifttype32i64 %0
}

; --- i8 elements ---

%shifttype2i8 = type <2 x i8>
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
  ; SSE2-LABEL: shift2i8
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i8
  ; SSE2-CODEGEN: shrq %cl

  %0 = lshr %shifttype2i8 %a , %b
  ret %shifttype2i8 %0
}

%shifttype4i8 = type <4 x i8>
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
  ; SSE2-LABEL: shift4i8
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i8
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i8 %a , %b
  ret %shifttype4i8 %0
}

%shifttype8i8 = type <8 x i8>
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
  ; SSE2-LABEL: shift8i8
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i8
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype8i8 %a , %b
  ret %shifttype8i8 %0
}

%shifttype16i8 = type <16 x i8>
define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
entry:
  ; SSE2-LABEL: shift16i8
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i8
  ; SSE2-CODEGEN: shrb %cl

  %0 = lshr %shifttype16i8 %a , %b
  ret %shifttype16i8 %0
}

%shifttype32i8 = type <32 x i8>
define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
entry:
  ; SSE2-LABEL: shift32i8
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i8
  ; SSE2-CODEGEN: shrb %cl

  %0 = lshr %shifttype32i8 %a , %b
  ret %shifttype32i8 %0
}

;------------------------------------------------------------------------------
; Test shift by a constant vector.
;
; Every lane is shifted by the same constant (3). The expected costs in this
; group range from 1 to 16, and the expected assembly contains a packed psrl
; instruction with the immediate $3. The %b parameter is not used by these
; functions.
;------------------------------------------------------------------------------

; --- i16 elements ---

%shifttypec = type <2 x i16>
define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
entry:
  ; SSE2-LABEL: shift2i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i16const
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec %a , <i16 3, i16 3>
  ret %shifttypec %0
}

%shifttypec4i16 = type <4 x i16>
define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
entry:
  ; SSE2-LABEL: shift4i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i16const
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec4i16 %0
}

%shifttypec8i16 = type <8 x i16>
define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
entry:
  ; SSE2-LABEL: shift8i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec8i16 %0
}

%shifttypec16i16 = type <16 x i16>
define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                         %shifttypec16i16 %b) {
entry:
  ; SSE2-LABEL: shift16i16const
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec16i16 %0
}

%shifttypec32i16 = type <32 x i16>
define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
entry:
  ; SSE2-LABEL: shift32i16const
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3,
                                   i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec32i16 %0
}

; --- i32 elements ---

%shifttypec2i32 = type <2 x i32>
define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
entry:
  ; SSE2-LABEL: shift2i32c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i32c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
  ret %shifttypec2i32 %0
}

%shifttypec4i32 = type <4 x i32>
define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
entry:
  ; SSE2-LABEL: shift4i32c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec4i32 %0
}

%shifttypec8i32 = type <8 x i32>
define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
entry:
  ; SSE2-LABEL: shift8i32c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec8i32 %0
}

%shifttypec16i32 = type <16 x i32>
define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
entry:
  ; SSE2-LABEL: shift16i32c
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec16i32 %0
}

%shifttypec32i32 = type <32 x i32>
define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
entry:
  ; SSE2-LABEL: shift32i32c
  ; SSE2: cost of 8 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3,
                                   i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec32i32 %0
}

; --- i64 elements ---

%shifttypec2i64 = type <2 x i64>
define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
entry:
  ; SSE2-LABEL: shift2i64c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
  ret %shifttypec2i64 %0
}

%shifttypec4i64 = type <4 x i64>
define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
entry:
  ; SSE2-LABEL: shift4i64c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec4i64 %0
}

%shifttypec8i64 = type <8 x i64>
define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
entry:
  ; SSE2-LABEL: shift8i64c
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec8i64 %0
}

%shifttypec16i64 = type <16 x i64>
define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
entry:
  ; SSE2-LABEL: shift16i64c
  ; SSE2: cost of 8 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec16i64 %0
}

%shifttypec32i64 = type <32 x i64>
define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
entry:
  ; SSE2-LABEL: shift32i64c
  ; SSE2: cost of 16 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec32i64 %a , <i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3,
                                   i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec32i64 %0
}

; --- i8 elements ---

%shifttypec2i8 = type <2 x i8>
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
  ; SSE2-LABEL: shift2i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift2i8c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
  ret %shifttypec2i8 %0
}

%shifttypec4i8 = type <4 x i8>
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
  ; SSE2-LABEL: shift4i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift4i8c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec4i8 %0
}

%shifttypec8i8 = type <8 x i8>
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
  ; SSE2-LABEL: shift8i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift8i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec8i8 %0
}

%shifttypec16i8 = type <16 x i8>
define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
entry:
  ; SSE2-LABEL: shift16i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift16i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec16i8 %0
}

%shifttypec32i8 = type <32 x i8>
define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
entry:
  ; SSE2-LABEL: shift32i8c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN-LABEL: shift32i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3,
                                  i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec32i8 %0
}