Home | History | Annotate | Download | only in X86
      1 ; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
      2 
      3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
        ; Data layout used by every function in this file: little-endian ('e'),
        ; 32-bit pointers ('p:32:32:32'), 128-bit vectors aligned to 128 bits
        ; ('v128:128:128'). It accompanies the i386 triple given on the RUN line.
      4 
      5 
      6 
      7 ; Make sure we order the operands of commutative operations so that we get
      8 ; bigger vectorizable trees.
      9 
     10 ; CHECK-LABEL: shuffle_operands1
     11 ; CHECK:         load <2 x double>
     12 ; CHECK:         fadd <2 x double>
     13 
     14 define void @shuffle_operands1(double * noalias %from, double * noalias %to,
     15                                double %v1, double %v2) {
          ; Computes to[0] = from[0] + %v1 and to[1] = %v2 + from[1].
          ; The two scalar fadds put the loaded value on opposite sides, so the
          ; operands of one of them have to be commuted before both loads can be
          ; bundled together. The patterns preceding this function expect a
          ; <2 x double> load feeding a <2 x double> fadd.
     16   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
     17   %v0_1 = load double , double * %from
     18   %v0_2 = load double , double * %from_1
     19   %v1_1 = fadd double %v0_1, %v1  ; loaded value is the LEFT operand
     20   %v1_2 = fadd double %v2, %v0_2  ; loaded value is the RIGHT operand (swapped)
     21   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
     22   store double %v1_1, double *%to
     23   store double %v1_2, double *%to_2
     24   ret void
     25 }
     26 
     27 ; CHECK-LABEL: shuffle_preserve_broadcast
     28 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
     29 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
     30 define void @shuffle_preserve_broadcast(double * noalias %from,
     31                                         double * noalias %to,
     32                                         double %v1, double %v2) {
          ; Loop body computes to[0] = from[0] + %p and to[1] = from[0] + from[1].
          ; %v0_1 (from[0]) is shared by both fadds and is the LEFT operand of each.
          ; The patterns preceding this function expect %v0_1 to be inserted twice
          ; into one <2 x double>, i.e. the shared operand stays grouped together.
          ; %v1 and %v2 are not referenced by the body.
     33 entry:
     34 br label %lp
     35 
     36 lp:
     37   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
     38   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
     39   %v0_1 = load double , double * %from
     40   %v0_2 = load double , double * %from_1
     41   %v1_1 = fadd double %v0_1, %p  ; shared %v0_1 on the left
     42   %v1_2 = fadd double %v0_1, %v0_2  ; shared %v0_1 on the left
     43   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
     44   store double %v1_1, double *%to
     45   store double %v1_2, double *%to_2
     46 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
     47 
     48 ext:
     49   ret void
     50 }
     51 
     52 ; CHECK-LABEL: shuffle_preserve_broadcast2
     53 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
     54 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
     55 define void @shuffle_preserve_broadcast2(double * noalias %from,
     56                                         double * noalias %to,
     57                                         double %v1, double %v2) {
          ; Loop body computes to[0] = %p + from[0] and to[1] = from[1] + from[0].
          ; %v0_1 (from[0]) is shared by both fadds and is the RIGHT operand of each.
          ; The patterns preceding this function expect %v0_1 to be inserted twice
          ; into one <2 x double>.
          ; %v1 and %v2 are not referenced by the body.
     58 entry:
     59 br label %lp
     60 
     61 lp:
     62   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
     63   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
     64   %v0_1 = load double , double * %from
     65   %v0_2 = load double , double * %from_1
     66   %v1_1 = fadd double %p, %v0_1  ; shared %v0_1 on the right
     67   %v1_2 = fadd double %v0_2, %v0_1  ; shared %v0_1 on the right
     68   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
     69   store double %v1_1, double *%to
     70   store double %v1_2, double *%to_2
     71 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
     72 
     73 ext:
     74   ret void
     75 }
     76 
     77 ; CHECK-LABEL: shuffle_preserve_broadcast3
     78 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
     79 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
     80 define void @shuffle_preserve_broadcast3(double * noalias %from,
     81                                         double * noalias %to,
     82                                         double %v1, double %v2) {
          ; Loop body computes to[0] = %p + from[0] and to[1] = from[0] + from[1].
          ; %v0_1 (from[0]) is shared by both fadds but sits on the RIGHT in the
          ; first and on the LEFT in the second. The patterns preceding this
          ; function still expect %v0_1 to be inserted twice into one <2 x double>.
          ; %v1 and %v2 are not referenced by the body.
     83 entry:
     84 br label %lp
     85 
     86 lp:
     87   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
     88   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
     89   %v0_1 = load double , double * %from
     90   %v0_2 = load double , double * %from_1
     91   %v1_1 = fadd double %p, %v0_1  ; shared %v0_1 on the right
     92   %v1_2 = fadd double %v0_1, %v0_2  ; shared %v0_1 on the left
     93   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
     94   store double %v1_1, double *%to
     95   store double %v1_2, double *%to_2
     96 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
     97 
     98 ext:
     99   ret void
    100 }
    101 
    102 
    103 ; CHECK-LABEL: shuffle_preserve_broadcast4
    104 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
    105 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
    106 define void @shuffle_preserve_broadcast4(double * noalias %from,
    107                                         double * noalias %to,
    108                                         double %v1, double %v2) {
          ; Loop body computes to[0] = from[1] + from[0] and to[1] = %p + from[0].
          ; %v0_1 (from[0]) is shared by both fadds and is the RIGHT operand of each;
          ; here the phi feeds the second store rather than the first. The patterns
          ; preceding this function expect %v0_1 to be inserted twice into one
          ; <2 x double>.
          ; %v1 and %v2 are not referenced by the body.
    109 entry:
    110 br label %lp
    111 
    112 lp:
    113   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
    114   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
    115   %v0_1 = load double , double * %from
    116   %v0_2 = load double , double * %from_1
    117   %v1_1 = fadd double %v0_2, %v0_1  ; shared %v0_1 on the right
    118   %v1_2 = fadd double %p, %v0_1  ; shared %v0_1 on the right
    119   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
    120   store double %v1_1, double *%to
    121   store double %v1_2, double *%to_2
    122 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
    123 
    124 ext:
    125   ret void
    126 }
    127 
    128 ; CHECK-LABEL: shuffle_preserve_broadcast5
    129 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
    130 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
    131 define void @shuffle_preserve_broadcast5(double * noalias %from,
    132                                         double * noalias %to,
    133                                         double %v1, double %v2) {
          ; Loop body computes to[0] = from[0] + from[1] and to[1] = %p + from[0].
          ; %v0_1 (from[0]) is shared by both fadds but sits on the LEFT in the
          ; first and on the RIGHT in the second. The patterns preceding this
          ; function expect %v0_1 to be inserted twice into one <2 x double>.
          ; %v1 and %v2 are not referenced by the body.
    134 entry:
    135 br label %lp
    136 
    137 lp:
    138   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
    139   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
    140   %v0_1 = load double , double * %from
    141   %v0_2 = load double , double * %from_1
    142   %v1_1 = fadd double %v0_1, %v0_2  ; shared %v0_1 on the left
    143   %v1_2 = fadd double %p, %v0_1  ; shared %v0_1 on the right
    144   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
    145   store double %v1_1, double *%to
    146   store double %v1_2, double *%to_2
    147 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
    148 
    149 ext:
    150   ret void
    151 }
    152 
    153 
    154 ; CHECK-LABEL: shuffle_preserve_broadcast6
    155 ; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
    156 ; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
    157 define void @shuffle_preserve_broadcast6(double * noalias %from,
    158                                         double * noalias %to,
    159                                         double %v1, double %v2) {
          ; Loop body computes to[0] = from[0] + from[1] and to[1] = from[0] + %p.
          ; %v0_1 (from[0]) is shared by both fadds and is the LEFT operand of each;
          ; here the phi feeds the second store rather than the first. The patterns
          ; preceding this function expect %v0_1 to be inserted twice into one
          ; <2 x double>.
          ; %v1 and %v2 are not referenced by the body.
    160 entry:
    161 br label %lp
    162 
    163 lp:
    164   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]  ; 0.0 on entry, 1.0 on the back edge
    165   %from_1 = getelementptr double, double *%from, i64 1  ; &from[1]
    166   %v0_1 = load double , double * %from
    167   %v0_2 = load double , double * %from_1
    168   %v1_1 = fadd double %v0_1, %v0_2  ; shared %v0_1 on the left
    169   %v1_2 = fadd double %v0_1, %p  ; shared %v0_1 on the left
    170   %to_2 = getelementptr double, double * %to, i64 1  ; &to[1]
    171   store double %v1_1, double *%to
    172   store double %v1_2, double *%to_2
    173 br i1 undef, label %lp, label %ext  ; loop condition is deliberately undef
    174 
    175 ext:
    176   ret void
    177 }
    178 
    179 ; Make sure we don't scramble operands when we reorder them and destroy
    180 ; 'good' source order.
    181 
    182 ; CHECK-LABEL: good_load_order
    183 
    184 ; CHECK: %[[V1:[0-9]+]] = load <4 x float>, <4 x float>*
    185 ; CHECK: %[[V2:[0-9]+]] = insertelement <4 x float> undef, float %1, i32 0
    186 ; CHECK: %[[V3:[0-9]+]] = shufflevector <4 x float> %[[V2]], <4 x float> %[[V1]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
    187 ; CHECK:                = fmul <4 x float> %[[V1]], %[[V3]]
    188 
    189 @a = common global [32000 x float] zeroinitializer, align 16  ; zero-initialized float array read and written by @good_load_order
    190 
    191 define void @good_load_order() {
          ; Walks @a in steps of five elements. With i = %indvars.iv, each
          ; iteration performs:
          ;   a[i]   = a[i+1] * prev     (prev = %1: a[0] first, then last a[i+5] load)
          ;   a[i+1] = a[i+2] * a[i+1]
          ;   a[i+2] = a[i+3] * a[i+2]
          ;   a[i+3] = a[i+4] * a[i+3]
          ;   a[i+4] = a[i+5] * a[i+4]
          ; Every fmul takes the newer load as its left operand and the previous
          ; value as its right operand. The patterns preceding this function expect
          ; a <4 x float> load, an insertelement of %1 at index 0, a shufflevector
          ; combining the two, and a <4 x float> fmul of the load with that shuffle.
    192 entry:
    193   br label %for.cond1.preheader
    194 
    195 for.cond1.preheader:
    196   %0 = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i64 0, i64 0), align 16  ; a[0], initial value of %1
    197   br label %for.body3
    198 
    199 for.body3:
    200   %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]  ; value carried over from the previous iteration
    201   %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]  ; i, advanced by 5 per iteration
    202   %2 = add nsw i64 %indvars.iv, 1
    203   %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %2  ; &a[i+1]
    204   %3 = load float, float* %arrayidx, align 4
    205   %arrayidx5 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv  ; &a[i]
    206   %mul6 = fmul float %3, %1  ; a[i+1] * carried value
    207   store float %mul6, float* %arrayidx5, align 4
    208   %4 = add nsw i64 %indvars.iv, 2
    209   %arrayidx11 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %4  ; &a[i+2]
    210   %5 = load float, float* %arrayidx11, align 4
    211   %mul15 = fmul float %5, %3  ; a[i+2] * a[i+1]
    212   store float %mul15, float* %arrayidx, align 4
    213   %6 = add nsw i64 %indvars.iv, 3
    214   %arrayidx21 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %6  ; &a[i+3]
    215   %7 = load float, float* %arrayidx21, align 4
    216   %mul25 = fmul float %7, %5  ; a[i+3] * a[i+2]
    217   store float %mul25, float* %arrayidx11, align 4
    218   %8 = add nsw i64 %indvars.iv, 4
    219   %arrayidx31 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %8  ; &a[i+4]
    220   %9 = load float, float* %arrayidx31, align 4
    221   %mul35 = fmul float %9, %7  ; a[i+4] * a[i+3]
    222   store float %mul35, float* %arrayidx21, align 4
    223   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
    224   %arrayidx41 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next  ; &a[i+5]
    225   %10 = load float, float* %arrayidx41, align 4  ; becomes %1 on the next iteration
    226   %mul45 = fmul float %10, %9  ; a[i+5] * a[i+4]
    227   store float %mul45, float* %arrayidx31, align 4
    228   %11 = trunc i64 %indvars.iv.next to i32
    229   %cmp2 = icmp slt i32 %11, 31995  ; keep looping while i+5 < 31995
    230   br i1 %cmp2, label %for.body3, label %for.end
    231 
    232 for.end:
    233   ret void
    234 }
    235 
    236 ; Check vectorization of the following code for the double data type:
    237 ;  c[0] = a[0]+b[0];
    238 ;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
    239 
    240 ; CHECK-LABEL: load_reorder_double
    241 ; CHECK: load <2 x double>, <2 x double>*
    242 ; CHECK: fadd <2 x double>
    243 define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){
          ; Computes c[0] = a[0] + b[0] and c[1] = b[1] + a[1]. The second fadd has
          ; its loads in the opposite operand order from the first. The patterns
          ; preceding this function expect <2 x double> loads and a <2 x double> fadd.
    244   %1 = load double, double* %a
    245   %2 = load double, double* %b
    246   %3 = fadd double %1, %2  ; a[0] + b[0]
    247   store double %3, double* %c
    248   %4 = getelementptr inbounds double, double* %b, i64 1  ; &b[1]
    249   %5 = load double, double* %4
    250   %6 = getelementptr inbounds double, double* %a, i64 1  ; &a[1]
    251   %7 = load double, double* %6
    252   %8 = fadd double %5, %7  ; b[1] + a[1]: operands swapped relative to %3
    253   %9 = getelementptr inbounds double, double* %c, i64 1  ; &c[1]
    254   store double %8, double* %9
    255   ret void
    256 }
    257 
    258 ; Check vectorization of the following code for the float data type:
    259 ;  c[0] = a[0]+b[0];
    260 ;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
    261 ;  c[2] = a[2]+b[2];
    262 ;  c[3] = a[3]+b[3];
    263 
    264 ; CHECK-LABEL: load_reorder_float
    265 ; CHECK: load <4 x float>, <4 x float>*
    266 ; CHECK: fadd <4 x float>
    267 define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b){
          ; Computes c[k] = a[k] + b[k] for k = 0..3, except that element 1 is
          ; written as b[1] + a[1] (loads in the opposite operand order). The
          ; patterns preceding this function expect <4 x float> loads and a
          ; <4 x float> fadd.
    268   %1 = load float, float* %a
    269   %2 = load float, float* %b
    270   %3 = fadd float %1, %2  ; a[0] + b[0]
    271   store float %3, float* %c
    272   %4 = getelementptr inbounds float, float* %b, i64 1  ; &b[1]
    273   %5 = load float, float* %4
    274   %6 = getelementptr inbounds float, float* %a, i64 1  ; &a[1]
    275   %7 = load float, float* %6
    276   %8 = fadd float %5, %7  ; b[1] + a[1]: operands swapped relative to the other elements
    277   %9 = getelementptr inbounds float, float* %c, i64 1  ; &c[1]
    278   store float %8, float* %9
    279   %10 = getelementptr inbounds float, float* %a, i64 2  ; &a[2]
    280   %11 = load float, float* %10
    281   %12 = getelementptr inbounds float, float* %b, i64 2  ; &b[2]
    282   %13 = load float, float* %12
    283   %14 = fadd float %11, %13  ; a[2] + b[2]
    284   %15 = getelementptr inbounds float, float* %c, i64 2  ; &c[2]
    285   store float %14, float* %15
    286   %16 = getelementptr inbounds float, float* %a, i64 3  ; &a[3]
    287   %17 = load float, float* %16
    288   %18 = getelementptr inbounds float, float* %b, i64 3  ; &b[3]
    289   %19 = load float, float* %18
    290   %20 = fadd float %17, %19  ; a[3] + b[3]
    291   %21 = getelementptr inbounds float, float* %c, i64 3  ; &c[3]
    292   store float %20, float* %21
    293   ret void
    294 }
    295 
    296 ; Check that we properly reorder the code below so that it gets vectorized optimally:
    297 ; a[0] = (b[0]+c[0])+d[0];
    298 ; a[1] = d[1]+(b[1]+c[1]);
    299 ; a[2] = (b[2]+c[2])+d[2];
    300 ; a[3] = (b[3]+c[3])+d[3];
    301 
    302 ; CHECK-LABEL: opcode_reorder
    303 ; CHECK: load <4 x float>, <4 x float>*
    304 ; CHECK: fadd <4 x float>
    305 define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b, 
    306                             float* noalias nocapture readonly %c,float* noalias nocapture readonly %d){
          ; Computes a[k] = (b[k] + c[k]) + d[k] for k = 0..3, except that element 1
          ; is written as d[1] + (b[1] + c[1]): the inner fadd is the RIGHT operand
          ; of the outer fadd there and the LEFT operand everywhere else. The
          ; patterns preceding this function expect <4 x float> loads and a
          ; <4 x float> fadd.
    307   %1 = load float, float* %b
    308   %2 = load float, float* %c
    309   %3 = fadd float %1, %2  ; b[0] + c[0]
    310   %4 = load float, float* %d
    311   %5 = fadd float %3, %4  ; (b[0] + c[0]) + d[0]
    312   store float %5, float* %a
    313   %6 = getelementptr inbounds float, float* %d, i64 1  ; &d[1]
    314   %7 = load float, float* %6
    315   %8 = getelementptr inbounds float, float* %b, i64 1  ; &b[1]
    316   %9 = load float, float* %8
    317   %10 = getelementptr inbounds float, float* %c, i64 1  ; &c[1]
    318   %11 = load float, float* %10
    319   %12 = fadd float %9, %11  ; b[1] + c[1]
    320   %13 = fadd float %7, %12  ; d[1] + (b[1] + c[1]): inner sum on the right
    321   %14 = getelementptr inbounds float, float* %a, i64 1  ; &a[1]
    322   store float %13, float* %14
    323   %15 = getelementptr inbounds float, float* %b, i64 2  ; &b[2]
    324   %16 = load float, float* %15
    325   %17 = getelementptr inbounds float, float* %c, i64 2  ; &c[2]
    326   %18 = load float, float* %17
    327   %19 = fadd float %16, %18  ; b[2] + c[2]
    328   %20 = getelementptr inbounds float, float* %d, i64 2  ; &d[2]
    329   %21 = load float, float* %20
    330   %22 = fadd float %19, %21  ; (b[2] + c[2]) + d[2]
    331   %23 = getelementptr inbounds float, float* %a, i64 2  ; &a[2]
    332   store float %22, float* %23
    333   %24 = getelementptr inbounds float, float* %b, i64 3  ; &b[3]
    334   %25 = load float, float* %24
    335   %26 = getelementptr inbounds float, float* %c, i64 3  ; &c[3]
    336   %27 = load float, float* %26
    337   %28 = fadd float %25, %27  ; b[3] + c[3]
    338   %29 = getelementptr inbounds float, float* %d, i64 3  ; &d[3]
    339   %30 = load float, float* %29
    340   %31 = fadd float %28, %30  ; (b[3] + c[3]) + d[3]
    341   %32 = getelementptr inbounds float, float* %a, i64 3  ; &a[3]
    342   store float %31, float* %32
    343   ret void
    344 }
    345