; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s

; Aggregates shared by the tests in this file: %struct.A is eight consecutive
; i8 fields, %struct.B is eight consecutive i32 fields.
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }

; Each loop iteration stores the byte constants 1..8 into the eight fields of
; one %struct.A and then steps to the next struct.
; CHECK-LABEL: merge_const_store:
; save 1,2,3 ... as one big integer.
; (578437695752307201 is 0x0807060504030201.)
; CHECK: movabsq $578437695752307201
; CHECK: ret
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  ; The loop is skipped when %count is not positive.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 counts iterations; %.01 is the struct written in this iteration.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 5, i8* %6, align 1
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Eight zero stores to the i32 fields of one %struct.B per iteration, in a
; function carrying the noimplicitfloat attribute.
; No vectors because we use noimplicitfloat
; CHECK-LABEL: merge_const_store_no_vec:
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Move the constants using a single vector store.
; Eight zero stores to the i32 fields of one %struct.B per iteration; unlike
; the noimplicitfloat variant, a vector store is expected here.
; CHECK-LABEL: merge_const_store_vec:
; CHECK: vmovups
; CHECK: ret
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
  ; The loop is skipped when %count is not positive.
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  ; %i.02 counts iterations; %.01 is the struct written in this iteration.
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}

; Move the first 4 constants as a single 32-bit integer store. Move the rest
; as scalars.
; Bytes 0..3 receive the constants 1..4 (67305985 is 0x04030201); byte 4
; receives the non-constant argument %zz, followed by three more constants.
; CHECK-LABEL: merge_nonconst_store:
; CHECK: movl $67305985
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}


; Two adjacent i8 loads from %q are copied to two adjacent i8 fields of the
; current destination struct; a single 16-bit load and store is expected.
; CHECK-LABEL: merge_loads_i16:
; load:
; BWON: movzwl
; BWOFF: movw
; store:
; CHECK: movw
; CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are computed once, outside the loop.
  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %4

; <label>:4                                       ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
  %5 = load i8, i8* %2, align 1
  %6 = load i8, i8* %3, align 1
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 %5, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 %6, i8* %8, align 1
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge:                                      ; preds = %4, %0
  ret void
}

; The loads and the stores are interleaved. Can't merge them.
; CHECK-LABEL: no_merge_loads:
; BWON: movzbl
; BWOFF: movb
; CHECK: movb
; BWON: movzbl
; BWOFF: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %a4

a4:                                               ; preds = %a4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
  %a5 = load i8, i8* %2, align 1
  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 %a5, i8* %a7, align 1
  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  ; The second load is issued only after the first store.
  %a6 = load i8, i8* %3, align 1
  store i8 %a6, i8* %a8, align 1
  %a9 = add nsw i32 %i.02, 1
  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %a9, %count
  br i1 %exitcond, label %._crit_edge, label %a4

._crit_edge:                                      ; preds = %a4, %0
  ret void
}


; Two adjacent i32 loads from %q are copied to two adjacent i32 fields of the
; current destination struct; a single 64-bit load and store is expected.
; CHECK-LABEL: merge_loads_integer:
; load:
; CHECK: movq
; store:
; CHECK: movq
; CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  br label %4

; <label>:4                                       ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
  %5 = load i32, i32* %2
  %6 = load i32, i32* %3
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 %5, i32* %7
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 %6, i32* %8
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge:                                      ; preds = %4, %0
  ret void
}


; Four adjacent i32 loads from %q are copied to four adjacent i32 fields of
; the current destination struct; a vector load and store is expected.
; CHECK-LABEL: merge_loads_vector:
; load:
; CHECK: movups
; store:
; CHECK: movups
; CHECK: ret
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %block4

block4:                                           ; preds = %block4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  %b1 = load i32, i32* %a2
  %b2 = load i32, i32* %a3
  %b3 = load i32, i32* %a4
  %b4 = load i32, i32* %a5
  store i32 %b1, i32* %a7
  store i32 %b2, i32* %a8
  store i32 %b3, i32* %a9
  store i32 %b4, i32* %a10
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge:                                      ; preds = %block4, %0
  ret void
}

;; On x86, even unaligned copies should be merged to vector ops.
;; TODO: however, this cannot happen at the moment, due to brokenness
;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
;; visitSTORE.
; Four adjacent i32 loads from %q are copied to four adjacent i32 fields of
; the current destination struct, with every load and store marked align 1.
;
; TODO: once merging of these unaligned accesses works, turn both negative
; checks below into positive vmovups checks. The TODO marker is deliberately
; kept off the directive lines: FileCheck treats everything after the colon as
; the pattern, so trailing text there would keep the pattern from ever matching
; and the negative check would pass vacuously.
; CHECK-LABEL: merge_loads_no_align:
; load:
; CHECK-NOT: vmovups
; store:
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
  ; The loop is skipped when %count is not positive.
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  ; Source addresses are computed once, outside the loop.
  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %block4

block4:                                           ; preds = %block4, %.lr.ph
  ; %i.02 counts iterations; %.01 is the struct written in this iteration.
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  %b1 = load i32, i32* %a2, align 1
  %b2 = load i32, i32* %a3, align 1
  %b3 = load i32, i32* %a4, align 1
  %b4 = load i32, i32* %a5, align 1
  store i32 %b1, i32* %a7, align 1
  store i32 %b2, i32* %a8, align 1
  store i32 %b3, i32* %a9, align 1
  store i32 %b4, i32* %a10, align 1
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge:                                      ; preds = %block4, %0
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; Each iteration reads an i64 index from %a, loads the two bytes at %c[index]
; and %c[index + 1], and stores them to two consecutive bytes of the output.
; The register captured from the load is reused in the 16-bit store pattern.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  ; %.09 is the remaining iteration count, %.08 the output cursor and %.0 the
  ; index cursor.
  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i64, i64* %.0, i64 1
  %3 = load i64, i64* %.0, align 1
  %4 = getelementptr inbounds i8, i8* %c, i64 %3
  %5 = load i8, i8* %4, align 1
  %6 = add i64 %3, 1
  %7 = getelementptr inbounds i8, i8* %c, i64 %6
  %8 = load i8, i8* %7, align 1
  store i8 %5, i8* %.08, align 1
  %9 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %8, i8* %9, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 2
  %11 = add nsw i32 %.09, -1
  %12 = icmp eq i32 %11, 0
  br i1 %12, label %13, label %1

; <label>:13
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; The index is an i8 loaded from %a and sign-extended to i64; the second
; address uses that same extended value plus one.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  %7 = add i64 %4, 1
  %8 = getelementptr inbounds i8, i8* %c, i64 %7
  %9 = load i8, i8* %8, align 1
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}

; However, we can only ignore sign extensions while merging when they are
; applied to all memory computations. Here the second index is formed by adding
; 1 in i8 and sign-extending the result, so it is not the first extended index
; plus one.
;
; The negative patterns use anonymous register regexes on purpose: a variable
; captured inside one negative check is never bound when that check (as
; expected) does not match, which would leave a later use of it undefined.
; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; CHECK-NOT: movw (%{{.*}},%{{.*}}), %{{[a-z0-9]+}}
; CHECK-NOT: movw %{{[a-z0-9]+}}, (%{{.*}})
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  ; The increment happens in i8, before the extension.
  %7 = add i8 %3, 1
  %wrap.4 = sext i8 %7 to i64
  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
  %9 = load i8, i8* %8, align 1
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}

; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; All eight elements of %v are extracted and stored to eight consecutive float
; slots starting at %ptr.
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
  %vecext0 = extractelement <8 x float> %v, i32 0
  %vecext1 = extractelement <8 x float> %v, i32 1
  %vecext2 = extractelement <8 x float> %v, i32 2
  %vecext3 = extractelement <8 x float> %v, i32 3
  %vecext4 = extractelement <8 x float> %v, i32 4
  %vecext5 = extractelement <8 x float> %v, i32 5
  %vecext6 = extractelement <8 x float> %v, i32 6
  %vecext7 = extractelement <8 x float> %v, i32 7
  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
  %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
  %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
  %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
  %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
  %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
  store float %vecext0, float* %ptr, align 4
  store float %vecext1, float* %arrayidx1, align 4
  store float %vecext2, float* %arrayidx2, align 4
  store float %vecext3, float* %arrayidx3, align 4
  store float %vecext4, float* %arrayidx4, align 4
  store float %vecext5, float* %arrayidx5, align 4
  store float %vecext6, float* %arrayidx6, align 4
  store float %vecext7, float* %arrayidx7, align 4
  ret void

; CHECK-LABEL: merge_vec_element_store
; CHECK: vmovups
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}

; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
; The low and high <4 x float> halves of %v1 and %v2 are stored to four
; consecutive <4 x float> slots, starting at element 3 of %ptr.
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
  ; Destination slots 3..6.
  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
  ; Lanes 0..3 and 4..7 of each source vector.
  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
  ret void

; CHECK-LABEL: merge_vec_extract_stores
; CHECK: vmovups %ymm0, 48(%rdi)
; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}

; Merging vector stores when sourced from vector loads is not currently handled.
; Two consecutive <4 x float> values are loaded from %v and stored to two
; consecutive <4 x float> slots at %ptr; four separate aligned moves are
; expected.
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
  %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
  %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
  %v0 = load <4 x float>, <4 x float>* %load_idx0
  %v1 = load <4 x float>, <4 x float>* %load_idx1
  %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
  %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
  ret void

; CHECK-LABEL: merge_vec_stores_from_loads
; CHECK: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
}

; Merging vector stores when sourced from a constant vector is not currently handled.
; Two all-zero <4 x i32> vectors are stored to slots 3 and 4 of %ptr.
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
  ret void

; CHECK-LABEL: merge_vec_stores_of_constants
; CHECK: vxorps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
}

; This is a minimized test based on real code that was failing.
; We could merge stores (and loads) like this...
; Element 0 of the array is copied to element 4 through a scalar i64 load.
; Element 1 is copied to element 5 by loading a <2 x i64> at element 1 and
; extracting lane 0. The checks expect four scalar 64-bit moves.
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
  %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
  %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5

  ; Scalar copy: array[4] = array[0].
  %a0 = load i64, i64* %idx0, align 8
  store i64 %a0, i64* %idx4, align 8

  ; Vector load at array[1], of which only lane 0 is stored to array[5].
  %b = bitcast i64* %idx1 to <2 x i64>*
  %v = load <2 x i64>, <2 x i64>* %b, align 8
  %a1 = extractelement <2 x i64> %v, i32 0
  store i64 %a1, i64* %idx5, align 8
  ret void

; CHECK-LABEL: merge_vec_element_and_scalar_load
; CHECK: movq (%rdi), %rax
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: movq 8(%rdi), %rax
; CHECK-NEXT: movq %rax, 40(%rdi)
; CHECK-NEXT: retq
}