Home | History | Annotate | Download | only in X86
      1 ; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
      2 
      3 ;AVX1-NOT: llvm.masked
      4 
      5 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; module-wide data layout string applied to every function in this file
      6 target triple = "x86_64-pc_linux" ; module-wide target triple; the RUN line additionally selects -mcpu=knl
      7 
      8 ; The source code (SIZE is 4096 in the IR below):
      9 ;
     10 ;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
     11 ;
     12 ;  for (int i=0; i < SIZE; ++i) {
     13 ;    if (trigger[i] > 0) {
     14 ;      out[i] = in[index[i]] + (float) 0.5;
     15 ;    }
     16 ;  }
     17 ;}
     18 
     19 ;AVX512-LABEL: @foo1
     20 ;AVX512: llvm.masked.load.v16i32
     21 ;AVX512: llvm.masked.gather.v16f32
     22 ;AVX512: llvm.masked.store.v16f32
     23 ;AVX512: ret void
     24 
     25 ; Function Attrs: nounwind uwtable
     26 define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; for i in [0, 4096): if (trigger[i] > 0) out[i] = in[index[i]] + 0.5
     27 entry:
     28   %in.addr = alloca float*, align 8 ; one stack slot per pointer argument
     29   %out.addr = alloca float*, align 8
     30   %trigger.addr = alloca i32*, align 8
     31   %index.addr = alloca i32*, align 8
     32   %i = alloca i32, align 4 ; loop counter i is kept in memory, reloaded at each use
     33   store float* %in, float** %in.addr, align 8 ; copy the arguments into their slots
     34   store float* %out, float** %out.addr, align 8
     35   store i32* %trigger, i32** %trigger.addr, align 8
     36   store i32* %index, i32** %index.addr, align 8
     37   store i32 0, i32* %i, align 4 ; i = 0
     38   br label %for.cond
     39 
     40 for.cond:                                         ; preds = %for.inc, %entry
     41   %0 = load i32, i32* %i, align 4
     42   %cmp = icmp slt i32 %0, 4096 ; signed i < 4096 (SIZE in the C comment)
     43   br i1 %cmp, label %for.body, label %for.end
     44 
     45 for.body:                                         ; preds = %for.cond
     46   %1 = load i32, i32* %i, align 4
     47   %idxprom = sext i32 %1 to i64 ; widen i to a 64-bit GEP index
     48   %2 = load i32*, i32** %trigger.addr, align 8
     49   %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom ; &trigger[i]
     50   %3 = load i32, i32* %arrayidx, align 4 ; trigger[i]
     51   %cmp1 = icmp sgt i32 %3, 0 ; the guard: trigger[i] > 0
     52   br i1 %cmp1, label %if.then, label %if.end
     53 
     54 if.then:                                          ; preds = %for.body
     55   %4 = load i32, i32* %i, align 4
     56   %idxprom2 = sext i32 %4 to i64
     57   %5 = load i32*, i32** %index.addr, align 8
     58   %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 ; &index[i]
     59   %6 = load i32, i32* %arrayidx3, align 4 ; index[i], loaded only under the guard
     60   %idxprom4 = sext i32 %6 to i64
     61   %7 = load float*, float** %in.addr, align 8
     62   %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4 ; &in[index[i]] - address depends on loaded data
     63   %8 = load float, float* %arrayidx5, align 4 ; in[index[i]]; the AVX512 checks for @foo1 expect llvm.masked.gather.v16f32
     64   %add = fadd float %8, 5.000000e-01 ; + 0.5
     65   %9 = load i32, i32* %i, align 4
     66   %idxprom6 = sext i32 %9 to i64
     67   %10 = load float*, float** %out.addr, align 8
     68   %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6 ; &out[i]
     69   store float %add, float* %arrayidx7, align 4 ; out[i] = ...; the AVX512 checks for @foo1 expect llvm.masked.store.v16f32
     70   br label %if.end
     71 
     72 if.end:                                           ; preds = %if.then, %for.body
     73   br label %for.inc
     74 
     75 for.inc:                                          ; preds = %if.end
     76   %11 = load i32, i32* %i, align 4
     77   %inc = add nsw i32 %11, 1 ; ++i (nsw: no signed wrap)
     78   store i32 %inc, i32* %i, align 4
     79   br label %for.cond
     80 
     81 for.end:                                          ; preds = %for.cond
     82   ret void
     83 }
     84 
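     85 ; The source code (note: the IR definition of @foo2 below also takes a fourth argument, %index, which it stores but never reads):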
     86 ;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
     87 ;
     88 ;  for (int i=0; i<SIZE; ++i) {
     89 ;    if (trigger[i] > 0) {
     90 ;      out[i] = in[i].b + (float) 0.5;
     91 ;    }
     92 ;  }
     93 ;}
     94 
     95 %struct.In = type { float, float } ; two float fields; @foo2 and @foo3 read field index 1 (in[i].b in the C comments)
     96 
     97 ;AVX512-LABEL: @foo2
     98 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
     99 ;AVX512: llvm.masked.gather.v16f32
    100 ;AVX512: llvm.masked.store.v16f32
    101 ;AVX512: ret void
    102 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; for i in [0, 4096): if (trigger[i] > 0) out[i] = in[i].b + 0.5; NOTE(review): no 'attributes #0' definition is visible in this file - confirm
    103 entry:
    104   %in.addr = alloca %struct.In*, align 8 ; one stack slot per pointer argument
    105   %out.addr = alloca float*, align 8
    106   %trigger.addr = alloca i32*, align 8
    107   %index.addr = alloca i32*, align 8 ; written below but never loaded in this function
    108   %i = alloca i32, align 4 ; loop counter i is kept in memory, reloaded at each use
    109   store %struct.In* %in, %struct.In** %in.addr, align 8 ; copy the arguments into their slots
    110   store float* %out, float** %out.addr, align 8
    111   store i32* %trigger, i32** %trigger.addr, align 8
    112   store i32* %index, i32** %index.addr, align 8
    113   store i32 0, i32* %i, align 4 ; i = 0
    114   br label %for.cond
    115 
    116 for.cond:                                         ; preds = %for.inc, %entry
    117   %0 = load i32, i32* %i, align 4
    118   %cmp = icmp slt i32 %0, 4096 ; signed i < 4096 (SIZE in the C comment)
    119   br i1 %cmp, label %for.body, label %for.end
    120 
    121 for.body:                                         ; preds = %for.cond
    122   %1 = load i32, i32* %i, align 4
    123   %idxprom = sext i32 %1 to i64 ; widen i to a 64-bit GEP index
    124   %2 = load i32*, i32** %trigger.addr, align 8
    125   %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom ; &trigger[i]
    126   %3 = load i32, i32* %arrayidx, align 4 ; trigger[i]
    127   %cmp1 = icmp sgt i32 %3, 0 ; the guard: trigger[i] > 0
    128   br i1 %cmp1, label %if.then, label %if.end
    129 
    130 if.then:                                          ; preds = %for.body
    131   %4 = load i32, i32* %i, align 4
    132   %idxprom2 = sext i32 %4 to i64
    133   %5 = load %struct.In*, %struct.In** %in.addr, align 8
    134   %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 ; &in[i]
    135   %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 ; &in[i].b - field 1 of a two-float struct, so consecutive i are not adjacent floats
    136   %6 = load float, float* %b, align 4 ; in[i].b; the AVX512 checks for @foo2 expect llvm.masked.gather.v16f32
    137   %add = fadd float %6, 5.000000e-01 ; + 0.5
    138   %7 = load i32, i32* %i, align 4
    139   %idxprom4 = sext i32 %7 to i64
    140   %8 = load float*, float** %out.addr, align 8
    141   %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4 ; &out[i]
    142   store float %add, float* %arrayidx5, align 4 ; out[i] = ...; the AVX512 checks for @foo2 expect llvm.masked.store.v16f32
    143   br label %if.end
    144 
    145 if.end:                                           ; preds = %if.then, %for.body
    146   br label %for.inc
    147 
    148 for.inc:                                          ; preds = %if.end
    149   %9 = load i32, i32* %i, align 4
    150   %inc = add nsw i32 %9, 1 ; ++i (nsw: no signed wrap)
    151   store i32 %inc, i32* %i, align 4
    152   br label %for.cond
    153 
    154 for.end:                                          ; preds = %for.cond
    155   ret void
    156 }
    157 
    158 ; The source code
    159 ;struct Out {
    160 ;  float a;
    161 ;  float b;
    162 ;};
    163 ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
    164 ;
    165 ;  for (int i=0; i<SIZE; ++i) {
    166 ;    if (trigger[i] > 0) {
    167 ;      out[i].b = in[i].b + (float) 0.5;
    168 ;    }
    169 ;  }
    170 ;}
    171 
    172 ;AVX512-LABEL: @foo3
    173 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
    174 ;AVX512: llvm.masked.gather.v16f32
    175 ;AVX512: fadd <16 x float>
    176 ;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
    177 ;AVX512: llvm.masked.scatter.v16f32
    178 ;AVX512: ret void
    179 
    180 %struct.Out = type { float, float } ; matches 'struct Out { float a; float b; }' from the C comment; @foo3 writes field index 1 (b)
    181 
    182 define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; for i in [0, 4096): if (trigger[i] > 0) out[i].b = in[i].b + 0.5
    183 entry:
    184   %in.addr = alloca %struct.In*, align 8 ; one stack slot per pointer argument
    185   %out.addr = alloca %struct.Out*, align 8
    186   %trigger.addr = alloca i32*, align 8
    187   %i = alloca i32, align 4 ; loop counter i is kept in memory, reloaded at each use
    188   store %struct.In* %in, %struct.In** %in.addr, align 8 ; copy the arguments into their slots
    189   store %struct.Out* %out, %struct.Out** %out.addr, align 8
    190   store i32* %trigger, i32** %trigger.addr, align 8
    191   store i32 0, i32* %i, align 4 ; i = 0
    192   br label %for.cond
    193 
    194 for.cond:                                         ; preds = %for.inc, %entry
    195   %0 = load i32, i32* %i, align 4
    196   %cmp = icmp slt i32 %0, 4096 ; signed i < 4096 (SIZE in the C comment)
    197   br i1 %cmp, label %for.body, label %for.end
    198 
    199 for.body:                                         ; preds = %for.cond
    200   %1 = load i32, i32* %i, align 4
    201   %idxprom = sext i32 %1 to i64 ; widen i to a 64-bit GEP index
    202   %2 = load i32*, i32** %trigger.addr, align 8
    203   %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom ; &trigger[i]
    204   %3 = load i32, i32* %arrayidx, align 4 ; trigger[i]
    205   %cmp1 = icmp sgt i32 %3, 0 ; the guard: trigger[i] > 0
    206   br i1 %cmp1, label %if.then, label %if.end
    207 
    208 if.then:                                          ; preds = %for.body
    209   %4 = load i32, i32* %i, align 4
    210   %idxprom2 = sext i32 %4 to i64
    211   %5 = load %struct.In*, %struct.In** %in.addr, align 8
    212   %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 ; &in[i]
    213   %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 ; &in[i].b (field 1)
    214   %6 = load float, float* %b, align 4 ; in[i].b; the AVX512 checks for @foo3 expect llvm.masked.gather.v16f32
    215   %add = fadd float %6, 5.000000e-01 ; + 0.5; the checks expect this as 'fadd <16 x float>'
    216   %7 = load i32, i32* %i, align 4
    217   %idxprom4 = sext i32 %7 to i64
    218   %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
    219   %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4 ; &out[i]
    220   %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1 ; &out[i].b (field 1)
    221   store float %add, float* %b6, align 4 ; out[i].b = ...; the AVX512 checks for @foo3 expect llvm.masked.scatter.v16f32
    222   br label %if.end
    223 
    224 if.end:                                           ; preds = %if.then, %for.body
    225   br label %for.inc
    226 
    227 for.inc:                                          ; preds = %if.end
    228   %9 = load i32, i32* %i, align 4
    229   %inc = add nsw i32 %9, 1 ; ++i (nsw: no signed wrap)
    230   store i32 %inc, i32* %i, align 4
    231   br label %for.cond
    232 
    233 for.end:                                          ; preds = %for.cond
    234   ret void
    235 }
    236 declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>) ; declaration only: no function body visible in this file calls it; the @foo3 checks look for this intrinsic name in opt's output
    237