Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
      2 ; RUN:          -misched-topdown -verify-machineinstrs \
      3 ; RUN:     | FileCheck %s -check-prefix=TOPDOWN
      4 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
      5 ; RUN:          -misched=ilpmin -verify-machineinstrs \
      6 ; RUN:     | FileCheck %s -check-prefix=ILPMIN
      7 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
      8 ; RUN:          -misched=ilpmax -verify-machineinstrs \
      9 ; RUN:     | FileCheck %s -check-prefix=ILPMAX
     10 ;
     11 ; Verify that the MI scheduler minimizes register pressure for a
     12 ; uniform set of bottom-up subtrees (unrolled matrix multiply).
     13 ;
     14 ; For current top-down heuristics, ensure that some folded imulls have
     15 ; been reordered with the stores. This tests the scheduler's cheap
     16 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
     17 ;
     18 ; TOPDOWN-LABEL: %for.body
     19 ; TOPDOWN: movl %{{.*}}, (
     20 ; TOPDOWN: imull {{[0-9]*}}(
     21 ; TOPDOWN: movl %{{.*}}, 4(
     22 ; TOPDOWN: imull {{[0-9]*}}(
     23 ; TOPDOWN: movl %{{.*}}, 8(
     24 ; TOPDOWN: movl %{{.*}}, 12(
     25 ; TOPDOWN-LABEL: %for.end
     26 ;
     27 ; For -misched=ilpmin, verify that each expression subtree is
     28 ; scheduled independently, and that the imull/adds are interleaved.
     29 ;
     30 ; ILPMIN-LABEL: %for.body
     31 ; ILPMIN: movl %{{.*}}, (
     32 ; ILPMIN: imull
     33 ; ILPMIN: imull
     34 ; ILPMIN: addl
     35 ; ILPMIN: imull
     36 ; ILPMIN: addl
     37 ; ILPMIN: imull
     38 ; ILPMIN: addl
     39 ; ILPMIN: movl %{{.*}}, 4(
     40 ; ILPMIN: imull
     41 ; ILPMIN: imull
     42 ; ILPMIN: addl
     43 ; ILPMIN: imull
     44 ; ILPMIN: addl
     45 ; ILPMIN: imull
     46 ; ILPMIN: addl
     47 ; ILPMIN: movl %{{.*}}, 8(
     48 ; ILPMIN: imull
     49 ; ILPMIN: imull
     50 ; ILPMIN: addl
     51 ; ILPMIN: imull
     52 ; ILPMIN: addl
     53 ; ILPMIN: imull
     54 ; ILPMIN: addl
     55 ; ILPMIN: movl %{{.*}}, 12(
     56 ; ILPMIN-LABEL: %for.end
     57 ;
     58 ; For -misched=ilpmax, verify that each expression subtree is
     59 ; scheduled independently, and that the imull/adds are clustered.
     60 ;
     61 ; ILPMAX-LABEL: %for.body
     62 ; ILPMAX: movl %{{.*}}, (
     63 ; ILPMAX: imull
     64 ; ILPMAX: imull
     65 ; ILPMAX: imull
     66 ; ILPMAX: imull
     67 ; ILPMAX: addl
     68 ; ILPMAX: addl
     69 ; ILPMAX: addl
     70 ; ILPMAX: movl %{{.*}}, 4(
     71 ; ILPMAX: imull
     72 ; ILPMAX: imull
     73 ; ILPMAX: imull
     74 ; ILPMAX: imull
     75 ; ILPMAX: addl
     76 ; ILPMAX: addl
     77 ; ILPMAX: addl
     78 ; ILPMAX: movl %{{.*}}, 8(
     79 ; ILPMAX: imull
     80 ; ILPMAX: imull
     81 ; ILPMAX: imull
     82 ; ILPMAX: imull
     83 ; ILPMAX: addl
     84 ; ILPMAX: addl
     85 ; ILPMAX: addl
     86 ; ILPMAX: movl %{{.*}}, 12(
     87 ; ILPMAX-LABEL: %for.end
     88 
     ; @mmult(m1, m2, m3): unrolled matrix-multiply kernel used as scheduler input.
     ;
     ; Each iteration of %for.body handles one row index %indvars.iv (0..3) and
     ; computes, for every column j in 0..3:
     ;     m3[iv][j] = m1[iv][0]*m2[0][j] + m1[iv][1]*m2[1][j]
     ;               + m1[iv][2]*m2[2][j] + m1[iv][3]*m2[3][j]
     ; All three pointer arguments are declared noalias and nocapture.
     ;
     ; NOTE(review): the RUN lines use -pre-RA-sched=source and the CHECK lines
     ; match the emitted instruction order, so the order of the instructions
     ; below is presumably significant -- confirm before reordering anything.
     89 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
     90 [4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
     91 entry:
     92   br label %for.body
     93 
     94 for.body:                              ; preds = %for.body, %entry
          ; Row index: starts at 0 and is advanced by 1 at the bottom of the loop.
     95   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
          ; Operands for output column 0: m1[iv][k] paired with m2[k][0].
          ; (The load through %arrayidx12.3, %tmp7, appears after all other loads.)
     96   %arrayidx8 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 0
     97   %tmp = load i32* %arrayidx8, align 4
     98   %arrayidx12 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 0
     99   %tmp1 = load i32* %arrayidx12, align 4
    100   %arrayidx8.1 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 1
    101   %tmp2 = load i32* %arrayidx8.1, align 4
    102   %arrayidx12.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 0
    103   %tmp3 = load i32* %arrayidx12.1, align 4
    104   %arrayidx8.2 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 2
    105   %tmp4 = load i32* %arrayidx8.2, align 4
    106   %arrayidx12.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 0
    107   %tmp5 = load i32* %arrayidx12.2, align 4
    108   %arrayidx8.3 = getelementptr inbounds [4 x i32]* %m1, i64 %indvars.iv, i64 3
    109   %tmp6 = load i32* %arrayidx8.3, align 4
    110   %arrayidx12.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 0
          ; Operands for output column 1: the m1 row elements are loaded again
          ; through the same %arrayidx8* pointers, paired with m2[k][1].
    111   %tmp8 = load i32* %arrayidx8, align 4
    112   %arrayidx12.137 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 1
    113   %tmp9 = load i32* %arrayidx12.137, align 4
    114   %tmp10 = load i32* %arrayidx8.1, align 4
    115   %arrayidx12.1.1 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 1
    116   %tmp11 = load i32* %arrayidx12.1.1, align 4
    117   %tmp12 = load i32* %arrayidx8.2, align 4
    118   %arrayidx12.2.1 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 1
    119   %tmp13 = load i32* %arrayidx12.2.1, align 4
    120   %tmp14 = load i32* %arrayidx8.3, align 4
    121   %arrayidx12.3.1 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 1
    122   %tmp15 = load i32* %arrayidx12.3.1, align 4
          ; Operands for output column 2: m1[iv][k] paired with m2[k][2].
    123   %tmp16 = load i32* %arrayidx8, align 4
    124   %arrayidx12.239 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 2
    125   %tmp17 = load i32* %arrayidx12.239, align 4
    126   %tmp18 = load i32* %arrayidx8.1, align 4
    127   %arrayidx12.1.2 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 2
    128   %tmp19 = load i32* %arrayidx12.1.2, align 4
    129   %tmp20 = load i32* %arrayidx8.2, align 4
    130   %arrayidx12.2.2 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 2
    131   %tmp21 = load i32* %arrayidx12.2.2, align 4
    132   %tmp22 = load i32* %arrayidx8.3, align 4
    133   %arrayidx12.3.2 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 2
    134   %tmp23 = load i32* %arrayidx12.3.2, align 4
          ; Operands for output column 3: m1[iv][k] paired with m2[k][3].
    135   %tmp24 = load i32* %arrayidx8, align 4
    136   %arrayidx12.341 = getelementptr inbounds [4 x i32]* %m2, i64 0, i64 3
    137   %tmp25 = load i32* %arrayidx12.341, align 4
    138   %tmp26 = load i32* %arrayidx8.1, align 4
    139   %arrayidx12.1.3 = getelementptr inbounds [4 x i32]* %m2, i64 1, i64 3
    140   %tmp27 = load i32* %arrayidx12.1.3, align 4
    141   %tmp28 = load i32* %arrayidx8.2, align 4
    142   %arrayidx12.2.3 = getelementptr inbounds [4 x i32]* %m2, i64 2, i64 3
    143   %tmp29 = load i32* %arrayidx12.2.3, align 4
    144   %tmp30 = load i32* %arrayidx8.3, align 4
    145   %arrayidx12.3.3 = getelementptr inbounds [4 x i32]* %m2, i64 3, i64 3
    146   %tmp31 = load i32* %arrayidx12.3.3, align 4
          ; Last load: m2[3][0], consumed by %mul.3 (column 0).
    147   %tmp7 = load i32* %arrayidx12.3, align 4
          ; Sixteen products, grouped four per output column (columns 0, 1, 2, 3).
    148   %mul = mul nsw i32 %tmp1, %tmp
    149   %mul.1 = mul nsw i32 %tmp3, %tmp2
    150   %mul.2 = mul nsw i32 %tmp5, %tmp4
    151   %mul.3 = mul nsw i32 %tmp7, %tmp6
    152   %mul.138 = mul nsw i32 %tmp9, %tmp8
    153   %mul.1.1 = mul nsw i32 %tmp11, %tmp10
    154   %mul.2.1 = mul nsw i32 %tmp13, %tmp12
    155   %mul.3.1 = mul nsw i32 %tmp15, %tmp14
    156   %mul.240 = mul nsw i32 %tmp17, %tmp16
    157   %mul.1.2 = mul nsw i32 %tmp19, %tmp18
    158   %mul.2.2 = mul nsw i32 %tmp21, %tmp20
    159   %mul.3.2 = mul nsw i32 %tmp23, %tmp22
    160   %mul.342 = mul nsw i32 %tmp25, %tmp24
    161   %mul.1.3 = mul nsw i32 %tmp27, %tmp26
    162   %mul.2.3 = mul nsw i32 %tmp29, %tmp28
    163   %mul.3.3 = mul nsw i32 %tmp31, %tmp30
          ; Four independent chains of three adds; each chain sums the four
          ; products of one output column (%add.3, %add.3.1, %add.3.2, %add.3.3).
    164   %add.1 = add nsw i32 %mul.1, %mul
    165   %add.2 = add nsw i32 %mul.2, %add.1
    166   %add.3 = add nsw i32 %mul.3, %add.2
    167   %add.1.1 = add nsw i32 %mul.1.1, %mul.138
    168   %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
    169   %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
    170   %add.1.2 = add nsw i32 %mul.1.2, %mul.240
    171   %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
    172   %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
    173   %add.1.3 = add nsw i32 %mul.1.3, %mul.342
    174   %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
    175   %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
          ; Store the four column sums to m3[iv][0..3].
    176   %arrayidx16 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 0
    177   store i32 %add.3, i32* %arrayidx16, align 4
    178   %arrayidx16.1 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 1
    179   store i32 %add.3.1, i32* %arrayidx16.1, align 4
    180   %arrayidx16.2 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 2
    181   store i32 %add.3.2, i32* %arrayidx16.2, align 4
    182   %arrayidx16.3 = getelementptr inbounds [4 x i32]* %m3, i64 %indvars.iv, i64 3
    183   store i32 %add.3.3, i32* %arrayidx16.3, align 4
          ; Loop control: advance the row index, truncate it to i32, and leave
          ; the loop once the incremented index equals 4.
    184   %indvars.iv.next = add i64 %indvars.iv, 1
    185   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    186   %exitcond = icmp eq i32 %lftr.wideiv, 4
    187   br i1 %exitcond, label %for.end, label %for.body
    188 
    189 for.end:                                        ; preds = %for.body
    190   ret void
    191 }
    192