Home | History | Annotate | Download | only in AArch64
      1 ; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
      2 ; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
      3 
      4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
      5 target triple = "aarch64--linux-gnu"
      6 
      7 ; These tests check that the SLP vectorizer vectorizes the index calculations
      8 ; in the gather-reduce pattern shown below. We check one case where the index
      9 ; subtraction is performed in i32 and one where it is performed in i64.
     10 ;
     11 ; int gather_reduce_8x16(unsigned short *a, unsigned short *b, unsigned short *g, int n) {
     12 ;   int sum = 0;
     13 ;   for (int i = 0; i < n ; ++i) {
     14 ;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
     15 ;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
     16 ;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
     17 ;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
     18 ;   }
     19 ;   return sum;
     20 ; }
     21 
     22 ; GENERIC-LABEL: @gather_reduce_8x16_i32
     23 ;
     24 ; GENERIC: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
     25 ; GENERIC: zext <8 x i16> [[L]] to <8 x i32>
     26 ; GENERIC: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
     27 ; GENERIC: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
     28 ; GENERIC: sext i32 [[X]] to i64
     29 ;
     ; Scalar input for the i32 flavour of the gather-reduce test.
     ; Every loop iteration performs eight unrolled steps. Step k loads the k-th
     ; i16 at the current %a position and the k-th i16 of %b, zero-extends both
     ; to i32, subtracts them, indexes %g with that i32 difference, and adds the
     ; zero-extended i16 found there to the running sum.
     ;   %a - eight consecutive elements are read per iteration, then the
     ;        position advances by 8 elements
     ;   %b - read at the fixed offsets 0..7 on every iteration
     ;   %g - table indexed by the computed differences
     ;   %n - trip count; the loop is bypassed and 0 is returned unless %n > 0
     ; Returns the accumulated i32 sum.
     ; The check lines ahead of this function expect an <8 x i16> load that is
     ; zero-extended to <8 x i32>, a vector "sub nsw <8 x i32>", and an extracted
     ; lane that is sign-extended to i64.
     30 define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
     31 entry:
          ; Enter the loop only when %n > 0 (signed compare).
     32   %cmp.99 = icmp sgt i32 %n, 0
     33   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
     34 
     35 for.body.preheader:
     36   br label %for.body
     37 
     38 for.cond.cleanup.loopexit:
     39   br label %for.cond.cleanup
     40 
     41 for.cond.cleanup:
          ; 0 when the loop was bypassed from entry, otherwise the final sum.
     42   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
     43   ret i32 %sum.0.lcssa
     44 
     45 for.body:
          ; Loop-carried state: iteration counter, running sum, and the current
          ; read position in %a.
     46   %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
     47   %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
     48   %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
          ; Step 0 (element 0 of %a and %b). Each step begins by forming the
          ; %a address, and later the %b address, that the following step
          ; loads from.
     49   %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
     50   %0 = load i16, i16* %a.addr.0101, align 2
     51   %conv = zext i16 %0 to i32
     52   %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
     53   %1 = load i16, i16* %b, align 2
     54   %conv2 = zext i16 %1 to i32
     55   %sub = sub nsw i32 %conv, %conv2
     56   %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
     57   %2 = load i16, i16* %arrayidx, align 2
     58   %conv3 = zext i16 %2 to i32
     59   %add = add nsw i32 %conv3, %sum.0102
          ; Step 1 (element 1).
     60   %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
     61   %3 = load i16, i16* %incdec.ptr, align 2
     62   %conv5 = zext i16 %3 to i32
     63   %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
     64   %4 = load i16, i16* %incdec.ptr1, align 2
     65   %conv7 = zext i16 %4 to i32
     66   %sub8 = sub nsw i32 %conv5, %conv7
     67   %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
     68   %5 = load i16, i16* %arrayidx10, align 2
     69   %conv11 = zext i16 %5 to i32
     70   %add12 = add nsw i32 %add, %conv11
          ; Step 2 (element 2).
     71   %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
     72   %6 = load i16, i16* %incdec.ptr4, align 2
     73   %conv14 = zext i16 %6 to i32
     74   %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
     75   %7 = load i16, i16* %incdec.ptr6, align 2
     76   %conv16 = zext i16 %7 to i32
     77   %sub17 = sub nsw i32 %conv14, %conv16
     78   %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
     79   %8 = load i16, i16* %arrayidx19, align 2
     80   %conv20 = zext i16 %8 to i32
     81   %add21 = add nsw i32 %add12, %conv20
          ; Step 3 (element 3).
     82   %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
     83   %9 = load i16, i16* %incdec.ptr13, align 2
     84   %conv23 = zext i16 %9 to i32
     85   %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
     86   %10 = load i16, i16* %incdec.ptr15, align 2
     87   %conv25 = zext i16 %10 to i32
     88   %sub26 = sub nsw i32 %conv23, %conv25
     89   %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
     90   %11 = load i16, i16* %arrayidx28, align 2
     91   %conv29 = zext i16 %11 to i32
     92   %add30 = add nsw i32 %add21, %conv29
          ; Step 4 (element 4).
     93   %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
     94   %12 = load i16, i16* %incdec.ptr22, align 2
     95   %conv32 = zext i16 %12 to i32
     96   %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
     97   %13 = load i16, i16* %incdec.ptr24, align 2
     98   %conv34 = zext i16 %13 to i32
     99   %sub35 = sub nsw i32 %conv32, %conv34
    100   %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
    101   %14 = load i16, i16* %arrayidx37, align 2
    102   %conv38 = zext i16 %14 to i32
    103   %add39 = add nsw i32 %add30, %conv38
          ; Step 5 (element 5).
    104   %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
    105   %15 = load i16, i16* %incdec.ptr31, align 2
    106   %conv41 = zext i16 %15 to i32
    107   %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
    108   %16 = load i16, i16* %incdec.ptr33, align 2
    109   %conv43 = zext i16 %16 to i32
    110   %sub44 = sub nsw i32 %conv41, %conv43
    111   %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
    112   %17 = load i16, i16* %arrayidx46, align 2
    113   %conv47 = zext i16 %17 to i32
    114   %add48 = add nsw i32 %add39, %conv47
          ; Step 6 (element 6).
    115   %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
    116   %18 = load i16, i16* %incdec.ptr40, align 2
    117   %conv50 = zext i16 %18 to i32
    118   %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
    119   %19 = load i16, i16* %incdec.ptr42, align 2
    120   %conv52 = zext i16 %19 to i32
    121   %sub53 = sub nsw i32 %conv50, %conv52
    122   %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
    123   %20 = load i16, i16* %arrayidx55, align 2
    124   %conv56 = zext i16 %20 to i32
    125   %add57 = add nsw i32 %add48, %conv56
          ; Step 7 (element 7). %incdec.ptr58 is the %a position used by the
          ; next loop iteration (see the %a.addr.0101 phi).
    126   %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
    127   %21 = load i16, i16* %incdec.ptr49, align 2
    128   %conv59 = zext i16 %21 to i32
    129   %22 = load i16, i16* %incdec.ptr51, align 2
    130   %conv61 = zext i16 %22 to i32
    131   %sub62 = sub nsw i32 %conv59, %conv61
    132   %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
    133   %23 = load i16, i16* %arrayidx64, align 2
    134   %conv65 = zext i16 %23 to i32
    135   %add66 = add nsw i32 %add57, %conv65
          ; Latch: bump the iteration counter and leave once it equals %n.
    136   %inc = add nuw nsw i32 %i.0103, 1
    137   %exitcond = icmp eq i32 %inc, %n
    138   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
    139 }
    140 
    141 ; KRYO-LABEL: @gather_reduce_8x16_i64
    142 ;
    143 ; KRYO: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
    144 ; KRYO: zext <8 x i16> [[L]] to <8 x i32>
    145 ; KRYO: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
    146 ; KRYO: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
    147 ; KRYO: sext i32 [[X]] to i64
    148 ;
    ; Scalar input for the i64 flavour of the gather-reduce test.
    ; Every loop iteration performs eight unrolled steps. Step k loads the k-th
    ; i16 at the current %a position and the k-th i16 of %b, zero-extends both
    ; to i64, subtracts them in i64, indexes %g with that i64 difference, and
    ; adds the i16 found there (zero-extended to i32) to the running i32 sum.
    ;   %a - eight consecutive elements are read per iteration, then the
    ;        position advances by 8 elements
    ;   %b - read at the fixed offsets 0..7 on every iteration
    ;   %g - table indexed by the computed differences
    ;   %n - trip count; the loop is bypassed and 0 is returned unless %n > 0
    ; Returns the accumulated i32 sum.
    ; Although the scalar subtraction here is i64, the check lines ahead of this
    ; function expect an <8 x i16> load zero-extended to <8 x i32>, a vector
    ; "sub nsw <8 x i32>", and an extracted lane sign-extended to i64.
    149 define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
    150 entry:
          ; Enter the loop only when %n > 0 (signed compare).
    151   %cmp.99 = icmp sgt i32 %n, 0
    152   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
    153 
    154 for.body.preheader:
    155   br label %for.body
    156 
    157 for.cond.cleanup.loopexit:
    158   br label %for.cond.cleanup
    159 
    160 for.cond.cleanup:
          ; 0 when the loop was bypassed from entry, otherwise the final sum.
    161   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
    162   ret i32 %sum.0.lcssa
    163 
    164 for.body:
          ; Loop-carried state: iteration counter, running sum, and the current
          ; read position in %a.
    165   %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
    166   %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
    167   %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
          ; Step 0 (element 0 of %a and %b). Each step begins by forming the
          ; %a address, and later the %b address, that the following step
          ; loads from.
    168   %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
    169   %0 = load i16, i16* %a.addr.0101, align 2
    170   %conv = zext i16 %0 to i64
    171   %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
    172   %1 = load i16, i16* %b, align 2
    173   %conv2 = zext i16 %1 to i64
    174   %sub = sub nsw i64 %conv, %conv2
    175   %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
    176   %2 = load i16, i16* %arrayidx, align 2
    177   %conv3 = zext i16 %2 to i32
    178   %add = add nsw i32 %conv3, %sum.0102
          ; Step 1 (element 1).
    179   %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
    180   %3 = load i16, i16* %incdec.ptr, align 2
    181   %conv5 = zext i16 %3 to i64
    182   %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
    183   %4 = load i16, i16* %incdec.ptr1, align 2
    184   %conv7 = zext i16 %4 to i64
    185   %sub8 = sub nsw i64 %conv5, %conv7
    186   %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
    187   %5 = load i16, i16* %arrayidx10, align 2
    188   %conv11 = zext i16 %5 to i32
    189   %add12 = add nsw i32 %add, %conv11
          ; Step 2 (element 2).
    190   %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
    191   %6 = load i16, i16* %incdec.ptr4, align 2
    192   %conv14 = zext i16 %6 to i64
    193   %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
    194   %7 = load i16, i16* %incdec.ptr6, align 2
    195   %conv16 = zext i16 %7 to i64
    196   %sub17 = sub nsw i64 %conv14, %conv16
    197   %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
    198   %8 = load i16, i16* %arrayidx19, align 2
    199   %conv20 = zext i16 %8 to i32
    200   %add21 = add nsw i32 %add12, %conv20
          ; Step 3 (element 3).
    201   %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
    202   %9 = load i16, i16* %incdec.ptr13, align 2
    203   %conv23 = zext i16 %9 to i64
    204   %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
    205   %10 = load i16, i16* %incdec.ptr15, align 2
    206   %conv25 = zext i16 %10 to i64
    207   %sub26 = sub nsw i64 %conv23, %conv25
    208   %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
    209   %11 = load i16, i16* %arrayidx28, align 2
    210   %conv29 = zext i16 %11 to i32
    211   %add30 = add nsw i32 %add21, %conv29
          ; Step 4 (element 4).
    212   %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
    213   %12 = load i16, i16* %incdec.ptr22, align 2
    214   %conv32 = zext i16 %12 to i64
    215   %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
    216   %13 = load i16, i16* %incdec.ptr24, align 2
    217   %conv34 = zext i16 %13 to i64
    218   %sub35 = sub nsw i64 %conv32, %conv34
    219   %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
    220   %14 = load i16, i16* %arrayidx37, align 2
    221   %conv38 = zext i16 %14 to i32
    222   %add39 = add nsw i32 %add30, %conv38
          ; Step 5 (element 5).
    223   %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
    224   %15 = load i16, i16* %incdec.ptr31, align 2
    225   %conv41 = zext i16 %15 to i64
    226   %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
    227   %16 = load i16, i16* %incdec.ptr33, align 2
    228   %conv43 = zext i16 %16 to i64
    229   %sub44 = sub nsw i64 %conv41, %conv43
    230   %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
    231   %17 = load i16, i16* %arrayidx46, align 2
    232   %conv47 = zext i16 %17 to i32
    233   %add48 = add nsw i32 %add39, %conv47
          ; Step 6 (element 6).
    234   %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
    235   %18 = load i16, i16* %incdec.ptr40, align 2
    236   %conv50 = zext i16 %18 to i64
    237   %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
    238   %19 = load i16, i16* %incdec.ptr42, align 2
    239   %conv52 = zext i16 %19 to i64
    240   %sub53 = sub nsw i64 %conv50, %conv52
    241   %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
    242   %20 = load i16, i16* %arrayidx55, align 2
    243   %conv56 = zext i16 %20 to i32
    244   %add57 = add nsw i32 %add48, %conv56
          ; Step 7 (element 7). %incdec.ptr58 is the %a position used by the
          ; next loop iteration (see the %a.addr.0101 phi).
    245   %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
    246   %21 = load i16, i16* %incdec.ptr49, align 2
    247   %conv59 = zext i16 %21 to i64
    248   %22 = load i16, i16* %incdec.ptr51, align 2
    249   %conv61 = zext i16 %22 to i64
    250   %sub62 = sub nsw i64 %conv59, %conv61
    251   %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
    252   %23 = load i16, i16* %arrayidx64, align 2
    253   %conv65 = zext i16 %23 to i32
    254   %add66 = add nsw i32 %add57, %conv65
          ; Latch: bump the iteration counter and leave once it equals %n.
    255   %inc = add nuw nsw i32 %i.0103, 1
    256   %exitcond = icmp eq i32 %inc, %n
    257   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
    258 }
    259