; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
;
; Verify that misched resource/latency balance heuristics are sane.
;
; The full target triple is pinned (instead of selecting only the
; architecture) so that the calling convention, and therefore the register
; usage these checks rely on, does not vary with the host's default triple.

; unrolled_mmult1: a single loop that exits once the truncated, incremented
; induction variable equals 10. Each iteration sums ten products
; tmp56[k] * col_k[iv] for k = 0..9, where col_0..col_9 are %pre and
; %pre94..%pre102, and stores the sum to tmp55[iv]. %pre103 and %pre104 are
; accepted but never referenced.
define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
  i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
  i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
  nounwind uwtable ssp {
entry:
  br label %for.body

; imull folded loads should be in order and interleaved with addl, never
; adjacent. Also check that we have no spilling.
;
; Since mmult1 IR is already in good order, this effectively ensures
; the scheduler maintains source order.
;
; CHECK: %for.body
; CHECK-NOT: %rsp
; CHECK: imull 4
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 8
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 12
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 16
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 20
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 24
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 28
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 32
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 36
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK-NOT: {{imull|rsp}}
; CHECK: %end
for.body:
  %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
  ; Term 0: tmp56[0] * pre[iv].
  %tmp57 = load i32* %tmp56, align 4
  %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
  %tmp58 = load i32* %arrayidx12.us.i61, align 4
  %mul.us.i = mul nsw i32 %tmp58, %tmp57
  ; Term 1: tmp56[1] * pre94[iv], added to the running sum.
  %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
  %tmp59 = load i32* %arrayidx8.us.i.1, align 4
  %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
  %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
  %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
  %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
  ; Term 2: tmp56[2] * pre95[iv].
  %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
  %tmp61 = load i32* %arrayidx8.us.i.2, align 4
  %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
  %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
  %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
  %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
  ; Term 3: tmp56[3] * pre96[iv].
  %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
  %tmp63 = load i32* %arrayidx8.us.i.3, align 4
  %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
  %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
  %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
  %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
  ; Term 4: tmp56[4] * pre97[iv].
  %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
  %tmp65 = load i32* %arrayidx8.us.i.4, align 4
  %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
  %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
  %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
  %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
  ; Term 5: tmp56[5] * pre98[iv].
  %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
  %tmp67 = load i32* %arrayidx8.us.i.5, align 4
  %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
  %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
  %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
  %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
  ; Term 6: tmp56[6] * pre99[iv].
  %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
  %tmp69 = load i32* %arrayidx8.us.i.6, align 4
  %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
  %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
  %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
  %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
  ; Term 7: tmp56[7] * pre100[iv].
  %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
  %tmp71 = load i32* %arrayidx8.us.i.7, align 4
  %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
  %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
  %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
  %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
  ; Term 8: tmp56[8] * pre101[iv].
  %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
  %tmp73 = load i32* %arrayidx8.us.i.8, align 4
  %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
  %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
  %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
  %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
  ; Term 9: tmp56[9] * pre102[iv].
  %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
  %tmp75 = load i32* %arrayidx8.us.i.9, align 4
  %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
  %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
  %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
  %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
  ; Store the accumulated sum to tmp55[iv] and advance the loop.
  %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
  store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
  %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 10
  br i1 %exitcond, label %end, label %for.body

end:
  ret void
}

; Unlike the above loop, this IR starts out bad and must be
; rescheduled.
;
; CHECK: %for.body
; CHECK-NOT: %rsp
; CHECK: imull 4
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 8
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 12
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 16
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 20
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 24
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 28
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 32
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK: imull 36
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK-NOT: {{imull|rsp}}
; CHECK: %end

; unrolled_mmult2: one loop that exits when the truncated, incremented
; induction variable equals 10. Every iteration stores to tmp55[iv] the sum
; of ten products a_k * b_k, where a_k = tmp56[k] and b_k is element iv of
; %pre (k = 0) or %pre94..%pre102 (k = 1..9). %pre103 and %pre104 are not
; referenced. Most loads are issued ahead of the multiply/add chain; that
; input ordering is intentional and must be kept as is.
define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
  i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
  i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
  nounwind uwtable ssp {
entry:
  br label %for.body
for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  ; Operand loads for k = 0..6.
  %a0 = load i32* %tmp56, align 4
  %b0.addr = getelementptr inbounds i32* %pre, i64 %iv
  %b0 = load i32* %b0.addr, align 4
  %a1.addr = getelementptr inbounds i32* %tmp56, i64 1
  %a1 = load i32* %a1.addr, align 4
  %b1.addr = getelementptr inbounds i32* %pre94, i64 %iv
  %b1 = load i32* %b1.addr, align 4
  %a2.addr = getelementptr inbounds i32* %tmp56, i64 2
  %a2 = load i32* %a2.addr, align 4
  %b2.addr = getelementptr inbounds i32* %pre95, i64 %iv
  %b2 = load i32* %b2.addr, align 4
  %a3.addr = getelementptr inbounds i32* %tmp56, i64 3
  %a3 = load i32* %a3.addr, align 4
  %b3.addr = getelementptr inbounds i32* %pre96, i64 %iv
  %b3 = load i32* %b3.addr, align 4
  %a4.addr = getelementptr inbounds i32* %tmp56, i64 4
  %a4 = load i32* %a4.addr, align 4
  %b4.addr = getelementptr inbounds i32* %pre97, i64 %iv
  %b4 = load i32* %b4.addr, align 4
  %a5.addr = getelementptr inbounds i32* %tmp56, i64 5
  %a5 = load i32* %a5.addr, align 4
  %b5.addr = getelementptr inbounds i32* %pre98, i64 %iv
  %b5 = load i32* %b5.addr, align 4
  %a6.addr = getelementptr inbounds i32* %tmp56, i64 6
  %a6 = load i32* %a6.addr, align 4
  %b6.addr = getelementptr inbounds i32* %pre99, i64 %iv
  %b6 = load i32* %b6.addr, align 4
  ; The first product sits between the k = 6 and k = 7 loads.
  %prod0 = mul nsw i32 %b0, %a0
  ; Operand loads for k = 7..9.
  %a7.addr = getelementptr inbounds i32* %tmp56, i64 7
  %a7 = load i32* %a7.addr, align 4
  %b7.addr = getelementptr inbounds i32* %pre100, i64 %iv
  %b7 = load i32* %b7.addr, align 4
  %a8.addr = getelementptr inbounds i32* %tmp56, i64 8
  %a8 = load i32* %a8.addr, align 4
  %b8.addr = getelementptr inbounds i32* %pre101, i64 %iv
  %b8 = load i32* %b8.addr, align 4
  %a9.addr = getelementptr inbounds i32* %tmp56, i64 9
  %a9 = load i32* %a9.addr, align 4
  %b9.addr = getelementptr inbounds i32* %pre102, i64 %iv
  %b9 = load i32* %b9.addr, align 4
  ; Serial multiply/accumulate chain over the remaining nine terms.
  %prod1 = mul nsw i32 %b1, %a1
  %sum1 = add nsw i32 %prod1, %prod0
  %prod2 = mul nsw i32 %b2, %a2
  %sum2 = add nsw i32 %prod2, %sum1
  %prod3 = mul nsw i32 %b3, %a3
  %sum3 = add nsw i32 %prod3, %sum2
  %prod4 = mul nsw i32 %b4, %a4
  %sum4 = add nsw i32 %prod4, %sum3
  %prod5 = mul nsw i32 %b5, %a5
  %sum5 = add nsw i32 %prod5, %sum4
  %prod6 = mul nsw i32 %b6, %a6
  %sum6 = add nsw i32 %prod6, %sum5
  %prod7 = mul nsw i32 %b7, %a7
  %sum7 = add nsw i32 %prod7, %sum6
  %prod8 = mul nsw i32 %b8, %a8
  %sum8 = add nsw i32 %prod8, %sum7
  %prod9 = mul nsw i32 %b9, %a9
  %sum9 = add nsw i32 %prod9, %sum8
  ; Write the result for this iteration, then test the exit condition.
  %out.addr = getelementptr inbounds i32* %tmp55, i64 %iv
  store i32 %sum9, i32* %out.addr, align 4
  %iv.next = add i64 %iv, 1
  %iv.next.i32 = trunc i64 %iv.next to i32
  %done = icmp eq i32 %iv.next.i32, 10
  br i1 %done, label %end, label %for.body

end:
  ret void
}

; A mildly interesting little block extracted from a cipher. The
; balanced heuristics are interesting here because we have resource,
; latency, and register limits all at once. For now, simply check that
; we don't use any callee-saves.
; CHECK: @encpc1
; CHECK: %entry
; CHECK-NOT: push
; CHECK-NOT: pop
; CHECK: ret
@a = external global i32, align 4
@b = external global i32, align 4
@c = external global i32, align 4
@d = external global i32, align 4

; encpc1: straight-line block that shuffles values between the external
; globals @a, @b, @c and @d, then returns either 0 (when the sum of the
; loaded @c and @a values is non-zero) or the sum of four right-shifted
; copies of the first @a value (when that sum is zero).
;
; NOTE(review): the first load of @a claims align 16 although @a is declared
; with align 4 -- confirm this is intentional.
define i32 @encpc1() nounwind {
entry:
  ; Combine the left- and right-shifted-by-8 copies of @a and store to @b.
  %a.init = load i32* @a, align 16
  %shl8 = shl i32 %a.init, 8
  %shr8 = lshr i32 %a.init, 8
  %mix = or i32 %shl8, %shr8
  store i32 %mix, i32* @b
  ; Reload @a, add it to @c, and use the @c value as an index off @d.
  %a.reload = load i32* @a
  %c.val = load i32* @c
  %sum = add i32 %c.val, %a.reload
  %idx = zext i32 %c.val to i64
  %d.elt = getelementptr inbounds i32* @d, i64 %idx
  %d.val = load i32* %d.elt
  ; Constant stores, followed by a final round of global updates.
  store i32 346, i32* @c
  store i32 20021, i32* @d
  %a.last = load i32* @a
  store i32 %d.val, i32* @a
  store i32 %a.last, i32* @b
  store i32 %sum, i32* @c
  store i32 %a.last, i32* @d
  ; Extra shifts of the first @a value, consumed only on the %if path.
  %is.zero = icmp eq i32 %sum, 0
  %shr16 = lshr i32 %a.init, 16
  %shr24 = lshr i32 %a.init, 24
  %shr30 = lshr i32 %a.init, 30
  br i1 %is.zero, label %if, label %return
if:
  %acc1 = add i32 %shr8, %shr16
  %acc2 = add i32 %acc1, %shr24
  %acc3 = add i32 %acc2, %shr30
  br label %return
return:
  %ret.val = phi i32 [ 0, %entry ], [ %acc3, %if ]
  ret i32 %ret.val
}