; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -march=x86 -mcpu=core2 | FileCheck %s -check-prefix=X32
; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X32

; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
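; Roughly equivalent C for the IR below (illustrative sketch only; 'x' is the
; %x argument, and the caller is assumed to pass b == a + 4*n*x for some n >= 1):
;   int *p = a, s = 0;
;   do {
;     s += p[0] + p[x] + p[2*x] + p[3*x];
;     p += 4*x;
;   } while (p != b);
;   return s;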
;
; X64: @simple
; %x * 4
; X64: shlq $2
; no other address computation in the preheader
; X64-NEXT: xorl
; X64-NEXT: .align
; X64: %loop
; no complex address modes
; X64-NOT: (%{{[^)]+}},%{{[^)]+}},
;
; X32: @simple
; no expensive address computation in the preheader
; X32-NOT: imul
; X32: %loop
; no complex address modes
; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
entry:
  br label %loop
loop:
  %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, i32* %iv
  %iv1 = getelementptr inbounds i32, i32* %iv, i32 %x
  %v1 = load i32, i32* %iv1
  %iv2 = getelementptr inbounds i32, i32* %iv1, i32 %x
  %v2 = load i32, i32* %iv2
  %iv3 = getelementptr inbounds i32, i32* %iv2, i32 %x
  %v3 = load i32, i32* %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, i32* %iv3, i32 %x
  %cmp = icmp eq i32* %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @user is not currently chained because the IV is live across memory ops.
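; A rough C sketch of the loop below (illustrative only); the store through
; the base pointer is what keeps the IV live across the memory ops:
;   int *p = a, s = 0;
;   do {
;     s += p[0] + p[x] + p[2*x] + p[3*x];
;     *p = s;
;     p += 4*x;
;   } while (p != b);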
;
; X64: @user
; X64: shlq $4
; X64: lea
; X64: lea
; X64: %loop
; complex address modes
; X64: (%{{[^)]+}},%{{[^)]+}},
;
; X32: @user
; expensive address computation in the preheader
; X32: shll $4
; X32: lea
; X32: lea
; X32: %loop
; complex address modes
; X32: (%{{[^)]+}},%{{[^)]+}},
define i32 @user(i32* %a, i32* %b, i32 %x) nounwind {
entry:
  br label %loop
loop:
  %iv = phi i32* [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, i32* %iv
  %iv1 = getelementptr inbounds i32, i32* %iv, i32 %x
  %v1 = load i32, i32* %iv1
  %iv2 = getelementptr inbounds i32, i32* %iv1, i32 %x
  %v2 = load i32, i32* %iv2
  %iv3 = getelementptr inbounds i32, i32* %iv2, i32 %x
  %v3 = load i32, i32* %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, i32* %iv3, i32 %x
  store i32 %s4, i32* %iv
  %cmp = icmp eq i32* %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
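; A rough C equivalent of the IR below (illustrative only; 's' stands for
; %main_stride, and pointer arithmetic on %main is in bytes):
;   char *m = main; int *r = res;
;   for (int i = 0; i != z; ++i) {
;     *r = *(int *)m + *(int *)(m + s) + *(int *)(m + 2*s)
;        + *(int *)(m + 3*s) + *(int *)(m + 4*s);
;     m += 5*s + x;
;     r += y;
;   }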
;
; X64: extrastride:
; We currently don't handle this on X64 because the sexts cause
; strange increment expressions like this:
; IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
;
; X32: extrastride:
; no spills in the preheader
; X32-NOT: mov{{.*}}(%esp){{$}}
; X32: %for.body{{$}}
; no complex address modes
; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
; no reloads
; X32-NOT: (%esp)
define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
entry:
  %cmp8 = icmp eq i32 %z, 0
  br i1 %cmp8, label %for.end, label %for.body.lr.ph

for.body.lr.ph:                                   ; preds = %entry
  %add.ptr.sum = shl i32 %main_stride, 1 ; s*2
  %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
  %add.ptr2.sum = add i32 %x, %main_stride ; s + x
  %add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
  %add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %main.addr.011 = phi i8* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %res.addr.09 = phi i32* [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
  %0 = bitcast i8* %main.addr.011 to i32*
  %1 = load i32, i32* %0, align 4
  %add.ptr = getelementptr inbounds i8, i8* %main.addr.011, i32 %main_stride
  %2 = bitcast i8* %add.ptr to i32*
  %3 = load i32, i32* %2, align 4
  %add.ptr1 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr.sum
  %4 = bitcast i8* %add.ptr1 to i32*
  %5 = load i32, i32* %4, align 4
  %add.ptr2 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr1.sum
  %6 = bitcast i8* %add.ptr2 to i32*
  %7 = load i32, i32* %6, align 4
  %add.ptr3 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr4.sum
  %8 = bitcast i8* %add.ptr3 to i32*
  %9 = load i32, i32* %8, align 4
  %add = add i32 %3, %1
  %add4 = add i32 %add, %5
  %add5 = add i32 %add4, %7
  %add6 = add i32 %add5, %9
  store i32 %add6, i32* %res.addr.09, align 4
  %add.ptr6 = getelementptr inbounds i8, i8* %main.addr.011, i32 %add.ptr3.sum
  %add.ptr7 = getelementptr inbounds i32, i32* %res.addr.09, i32 %y
  %inc = add i32 %i.010, 1
  %cmp = icmp eq i32 %inc, %z
  br i1 %cmp, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; @foldedidx is an unrolled variant of this loop:
;  for (unsigned long i = 0; i < len; i += s) {
;    c[i] = a[i] + b[i];
;  }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
;
; X64: foldedidx:
; X64: movzbl -3(
;
; X32: foldedidx:
; X32: movzbl -3(
define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.07
  %0 = load i8, i8* %arrayidx, align 1
  %conv5 = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.07
  %1 = load i8, i8* %arrayidx1, align 1
  %conv26 = zext i8 %1 to i32
  %add = add nsw i32 %conv26, %conv5
  %conv3 = trunc i32 %add to i8
  %arrayidx4 = getelementptr inbounds i8, i8* %c, i32 %i.07
  store i8 %conv3, i8* %arrayidx4, align 1
  %inc1 = or i32 %i.07, 1
  %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc1
  %2 = load i8, i8* %arrayidx.1, align 1
  %conv5.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc1
  %3 = load i8, i8* %arrayidx1.1, align 1
  %conv26.1 = zext i8 %3 to i32
  %add.1 = add nsw i32 %conv26.1, %conv5.1
  %conv3.1 = trunc i32 %add.1 to i8
  %arrayidx4.1 = getelementptr inbounds i8, i8* %c, i32 %inc1
  store i8 %conv3.1, i8* %arrayidx4.1, align 1
  %inc.12 = or i32 %i.07, 2
  %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.12
  %4 = load i8, i8* %arrayidx.2, align 1
  %conv5.2 = zext i8 %4 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.12
  %5 = load i8, i8* %arrayidx1.2, align 1
  %conv26.2 = zext i8 %5 to i32
  %add.2 = add nsw i32 %conv26.2, %conv5.2
  %conv3.2 = trunc i32 %add.2 to i8
  %arrayidx4.2 = getelementptr inbounds i8, i8* %c, i32 %inc.12
  store i8 %conv3.2, i8* %arrayidx4.2, align 1
  %inc.23 = or i32 %i.07, 3
  %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.23
  %6 = load i8, i8* %arrayidx.3, align 1
  %conv5.3 = zext i8 %6 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.23
  %7 = load i8, i8* %arrayidx1.3, align 1
  %conv26.3 = zext i8 %7 to i32
  %add.3 = add nsw i32 %conv26.3, %conv5.3
  %conv3.3 = trunc i32 %add.3 to i8
  %arrayidx4.3 = getelementptr inbounds i8, i8* %c, i32 %inc.23
  store i8 %conv3.3, i8* %arrayidx4.3, align 1
  %inc.3 = add nsw i32 %i.07, 4
  %exitcond.3 = icmp eq i32 %inc.3, 400
  br i1 %exitcond.3, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; @multioper tests instructions with multiple IV user operands. We
; should be able to chain them independently of each other.
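; A rough C sketch of the loop below (illustrative only); each iteration has
; four stores that all use the same pointer IV:
;   int *p = a, i = 0;
;   do {
;     p[0] = i; p[1] = i + 1; p[2] = i + 2; p[3] = i + 3;
;     p += 4;
;     i += 4;
;   } while (i < n);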
;
; X64: @multioper
; X64: %for.body
; X64: movl %{{.*}},4)
; X64-NEXT: leal 1(
; X64-NEXT: movl %{{.*}},4)
; X64-NEXT: leal 2(
; X64-NEXT: movl %{{.*}},4)
; X64-NEXT: leal 3(
; X64-NEXT: movl %{{.*}},4)
;
; X32: @multioper
; X32: %for.body
; X32: movl %{{.*}},4)
; X32-NEXT: leal 1(
; X32-NEXT: movl %{{.*}},4)
; X32-NEXT: leal 2(
; X32-NEXT: movl %{{.*}},4)
; X32-NEXT: leal 3(
; X32-NEXT: movl %{{.*}},4)
define void @multioper(i32* %a, i32 %n) nounwind {
entry:
  br label %for.body

for.body:
  %p = phi i32* [ %p.next, %for.body ], [ %a, %entry ]
  %i = phi i32 [ %inc4, %for.body ], [ 0, %entry ]
  store i32 %i, i32* %p, align 4
  %inc1 = or i32 %i, 1
  %add.ptr.i1 = getelementptr inbounds i32, i32* %p, i32 1
  store i32 %inc1, i32* %add.ptr.i1, align 4
  %inc2 = add nsw i32 %i, 2
  %add.ptr.i2 = getelementptr inbounds i32, i32* %p, i32 2
  store i32 %inc2, i32* %add.ptr.i2, align 4
  %inc3 = add nsw i32 %i, 3
  %add.ptr.i3 = getelementptr inbounds i32, i32* %p, i32 3
  store i32 %inc3, i32* %add.ptr.i3, align 4
  %p.next = getelementptr inbounds i32, i32* %p, i32 4
  %inc4 = add nsw i32 %i, 4
  %cmp = icmp slt i32 %inc4, %n
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

; @testCmpZero has an ICmpZero LSR use that should not be hidden from
; LSR. Profitable chains should have more than one nonzero increment
; anyway.
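; A rough C equivalent (illustrative only; the dest/source naming follows the
; IR, where %dest0 is derived from %src and %source0 from %dst):
;   char *d = src + srcidx, *s = dst + dstidx;
;   do {
;     *d = (char)*(int *)s;
;     s += 4;
;     d += 1;
;   } while (d != src + srcidx + len);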
;
; X32: @testCmpZero
; X32: %for.body82.us
; X32: dec
; X32: jne
define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
entry:
  %dest0 = getelementptr inbounds i8, i8* %src, i32 %srcidx
  %source0 = getelementptr inbounds i8, i8* %dst, i32 %dstidx
  %add.ptr79.us.sum = add i32 %srcidx, %len
  %lftr.limit = getelementptr i8, i8* %src, i32 %add.ptr79.us.sum
  br label %for.body82.us

for.body82.us:
  %dest = phi i8* [ %dest0, %entry ], [ %incdec.ptr91.us, %for.body82.us ]
  %source = phi i8* [ %source0, %entry ], [ %add.ptr83.us, %for.body82.us ]
  %0 = bitcast i8* %source to i32*
  %1 = load i32, i32* %0, align 4
  %trunc = trunc i32 %1 to i8
  %add.ptr83.us = getelementptr inbounds i8, i8* %source, i32 4
  %incdec.ptr91.us = getelementptr inbounds i8, i8* %dest, i32 1
  store i8 %trunc, i8* %dest, align 1
  %exitcond = icmp eq i8* %incdec.ptr91.us, %lftr.limit
  br i1 %exitcond, label %return, label %for.body82.us

return:
  ret void
}