; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN:          -misched-topdown -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=TOPDOWN
; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN:          -misched=ilpmin -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=ILPMIN
; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN:          -misched=ilpmax -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=ILPMAX
;
; Verify that the MI scheduler minimizes register pressure for a
; uniform set of bottom-up subtrees (unrolled matrix multiply).
;
; For current top-down heuristics, ensure that some folded imulls have
; been reordered with the stores. This tests the scheduler's cheap
; alias analysis ability (that doesn't require any AliasAnalysis pass).
;
; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
; TOPDOWN-LABEL: %for.end
;
; For -misched=ilpmin, verify that each expression subtree is
; scheduled independently, and that the imull/adds are interleaved.
;
; ILPMIN-LABEL: %for.body
; ILPMIN: movl %{{.*}}, (
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 4(
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 8(
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 12(
; ILPMIN-LABEL: %for.end
;
; For -misched=ilpmax, verify that each expression subtree is
; scheduled independently, and that the imull/adds are clustered.
;
; ILPMAX-LABEL: %for.body
; ILPMAX: movl %{{.*}}, (
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 4(
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 8(
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 12(
; ILPMAX-LABEL: %for.end

; Test input - a fully unrolled 4x4 i32 matrix multiply.
;
; Each trip of %for.body handles one row i = %indvars.iv (0 through 3) and,
; for every column j in 0..3, stores
;   m3[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j]
;            + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
; All three pointer arguments are marked noalias.
;
; The body is laid out in three phases: all loads (grouped per output
; column, each load directly after the getelementptr that forms its
; address), then all sixteen multiplies, then the four add chains and
; their stores.
; NOTE(review): every llc invocation above passes -pre-RA-sched=source, so
; the instruction order in this block is presumably part of what the test
; exercises; keep the per-column grouping uniform when editing.
define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
[4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                              ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Column 0 operands: m1[i][k] and m2[k][0] for k = 0..3.
  %arrayidx8 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 0
  %tmp = load i32* %arrayidx8, align 4
  %arrayidx12 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 0
  %tmp1 = load i32* %arrayidx12, align 4
  %arrayidx8.1 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 1
  %tmp2 = load i32* %arrayidx8.1, align 4
  %arrayidx12.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 0
  %tmp3 = load i32* %arrayidx12.1, align 4
  %arrayidx8.2 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 2
  %tmp4 = load i32* %arrayidx8.2, align 4
  %arrayidx12.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 0
  %tmp5 = load i32* %arrayidx12.2, align 4
  %arrayidx8.3 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 3
  %tmp6 = load i32* %arrayidx8.3, align 4
  %arrayidx12.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 0
  %tmp7 = load i32* %arrayidx12.3, align 4
  ; Column 1 operands: the m1 row is re-loaded, m2[k][1] for k = 0..3.
  %tmp8 = load i32* %arrayidx8, align 4
  %arrayidx12.137 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 1
  %tmp9 = load i32* %arrayidx12.137, align 4
  %tmp10 = load i32* %arrayidx8.1, align 4
  %arrayidx12.1.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 1
  %tmp11 = load i32* %arrayidx12.1.1, align 4
  %tmp12 = load i32* %arrayidx8.2, align 4
  %arrayidx12.2.1 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 1
  %tmp13 = load i32* %arrayidx12.2.1, align 4
  %tmp14 = load i32* %arrayidx8.3, align 4
  %arrayidx12.3.1 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 1
  %tmp15 = load i32* %arrayidx12.3.1, align 4
  ; Column 2 operands: the m1 row is re-loaded, m2[k][2] for k = 0..3.
  %tmp16 = load i32* %arrayidx8, align 4
  %arrayidx12.239 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 2
  %tmp17 = load i32* %arrayidx12.239, align 4
  %tmp18 = load i32* %arrayidx8.1, align 4
  %arrayidx12.1.2 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 2
  %tmp19 = load i32* %arrayidx12.1.2, align 4
  %tmp20 = load i32* %arrayidx8.2, align 4
  %arrayidx12.2.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 2
  %tmp21 = load i32* %arrayidx12.2.2, align 4
  %tmp22 = load i32* %arrayidx8.3, align 4
  %arrayidx12.3.2 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 2
  %tmp23 = load i32* %arrayidx12.3.2, align 4
  ; Column 3 operands: the m1 row is re-loaded, m2[k][3] for k = 0..3.
  %tmp24 = load i32* %arrayidx8, align 4
  %arrayidx12.341 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 3
  %tmp25 = load i32* %arrayidx12.341, align 4
  %tmp26 = load i32* %arrayidx8.1, align 4
  %arrayidx12.1.3 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 3
  %tmp27 = load i32* %arrayidx12.1.3, align 4
  %tmp28 = load i32* %arrayidx8.2, align 4
  %arrayidx12.2.3 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 3
  %tmp29 = load i32* %arrayidx12.2.3, align 4
  %tmp30 = load i32* %arrayidx8.3, align 4
  %arrayidx12.3.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 3
  %tmp31 = load i32* %arrayidx12.3.3, align 4
  ; Sixteen products m2[k][j] * m1[i][k], four per output column.
  %mul = mul nsw i32 %tmp1, %tmp
  %mul.1 = mul nsw i32 %tmp3, %tmp2
  %mul.2 = mul nsw i32 %tmp5, %tmp4
  %mul.3 = mul nsw i32 %tmp7, %tmp6
  %mul.138 = mul nsw i32 %tmp9, %tmp8
  %mul.1.1 = mul nsw i32 %tmp11, %tmp10
  %mul.2.1 = mul nsw i32 %tmp13, %tmp12
  %mul.3.1 = mul nsw i32 %tmp15, %tmp14
  %mul.240 = mul nsw i32 %tmp17, %tmp16
  %mul.1.2 = mul nsw i32 %tmp19, %tmp18
  %mul.2.2 = mul nsw i32 %tmp21, %tmp20
  %mul.3.2 = mul nsw i32 %tmp23, %tmp22
  %mul.342 = mul nsw i32 %tmp25, %tmp24
  %mul.1.3 = mul nsw i32 %tmp27, %tmp26
  %mul.2.3 = mul nsw i32 %tmp29, %tmp28
  %mul.3.3 = mul nsw i32 %tmp31, %tmp30
  ; One serial add chain per output column.
  %add.1 = add nsw i32 %mul.1, %mul
  %add.2 = add nsw i32 %mul.2, %add.1
  %add.3 = add nsw i32 %mul.3, %add.2
  %add.1.1 = add nsw i32 %mul.1.1, %mul.138
  %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
  %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
  %add.1.2 = add nsw i32 %mul.1.2, %mul.240
  %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
  %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
  %add.1.3 = add nsw i32 %mul.1.3, %mul.342
  %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
  %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
  ; Write the finished row: m3[i][0..3].
  %arrayidx16 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 0
  store i32 %add.3, i32* %arrayidx16, align 4
  %arrayidx16.1 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 1
  store i32 %add.3.1, i32* %arrayidx16.1, align 4
  %arrayidx16.2 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 2
  store i32 %add.3.2, i32* %arrayidx16.2, align 4
  %arrayidx16.3 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 3
  store i32 %add.3.3, i32* %arrayidx16.3, align 4
  ; Advance to the next row; leave the loop once the truncated counter is 4.
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}