; Cost-model regression test: verifies the x86 TTI cost estimates for vector
; zext / sext / trunc casts under SSE2 vs. SSE4.1 (pmovzx/pmovsx make most
; extensions cheaper on SSE4.1; packus/packss affect truncation costs).
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s

define void @zext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i64
; SSE2: cost of 4 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i64
; SSE2: cost of 8 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @zext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i64
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v4i16_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i64
; SSE2: cost of 10 {{.*}} sext
;
; SSE41: sext_v4i16_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @zext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2: zext_v4i32_to_v4i64
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v4i32_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = zext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2: sext_v4i32_to_v4i64
; SSE2: cost of 5 {{.*}} sext
;
; SSE41: sext_v4i32_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = sext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: zext_v16i16_to_v16i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: sext_v16i16_to_v16i32
; SSE2: cost of 8 {{.*}} sext
;
; SSE41: sext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = sext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: zext_v8i16_to_v8i32
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: sext_v8i16_to_v8i32
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = sext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i32
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i32
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i32
; SSE2: cost of 9 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i32
; SSE2: cost of 12 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i32
; SSE2: cost of 2 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i32
; SSE2: cost of 3 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i16
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i16
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i16
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i16
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i16
; SSE2: cost of 10 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i16
; SSE41: cost of 6 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i16
; SSE2: cost of 5 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i16
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i16
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i16
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i8
; SSE2: cost of 7 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i8
; SSE41: cost of 7 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}

define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
; SSE2: truncate_v16i16_to_v16i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v16i16_to_v16i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = trunc <16 x i16> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
; SSE2: truncate_v8i16_to_v8i8
; SSE2: cost of 2 {{.*}} trunc
;
; SSE41: truncate_v8i16_to_v8i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = trunc <8 x i16> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
; SSE2: truncate_v4i16_to_v4i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v4i16_to_v4i8
; SSE41: cost of 2 {{.*}} trunc
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = trunc <4 x i16> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}