; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512

; Checks that the -O3 pipeline, when targeting KNL, turns the conditional
; loops below into 16-lane masked load / gather / store / scatter intrinsics.

; NOTE(review): no run line in this file enables the AVX1 prefix, so the
; directive below is currently inactive. Add a matching opt invocation (after
; confirming the expected output) if it is meant to be enforced.
;AVX1-NOT: llvm.masked

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux"

; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
;  for (int i=0; i < SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[index[i]] + (float) 0.5;
;    }
;  }
;}
;
; The loop bound in the IR below is 4096.

;AVX512-LABEL: @foo1
;AVX512: llvm.masked.load.v16i32
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void

define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
  %in.addr = alloca float*, align 8
  %out.addr = alloca float*, align 8
  %trigger.addr = alloca i32*, align 8
  %index.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store float* %in, float** %in.addr, align 8
  store float* %out, float** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32* %index, i32** %index.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; if (trigger[i] > 0)
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i] = in[index[i]] + 0.5 -- the indirect read of in[] is the gather.
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load i32*, i32** %index.addr, align 8
  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
  %6 = load i32, i32* %arrayidx3, align 4
  %idxprom4 = sext i32 %6 to i64
  %7 = load float*, float** %in.addr, align 8
  %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
  %8 = load float, float* %arrayidx5, align 4
  %add = fadd float %8, 5.000000e-01
  %9 = load i32, i32* %i, align 4
  %idxprom6 = sext i32 %9 to i64
  %10 = load float*, float** %out.addr, align 8
  %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
  store float %add, float* %arrayidx7, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %11 = load i32, i32* %i, align 4
  %inc = add nsw i32 %11, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; The source code
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}
;
; In is a struct of two floats (see %struct.In below); .b is field index 1.
; The IR function additionally takes an %index pointer that is spilled to a
; stack slot in the entry block but never read back.

%struct.In = type { float, float }

;AVX512-LABEL: @foo2
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
  %in.addr = alloca %struct.In*, align 8
  %out.addr = alloca float*, align 8
  %trigger.addr = alloca i32*, align 8
  %index.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store %struct.In* %in, %struct.In** %in.addr, align 8
  store float* %out, float** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32* %index, i32** %index.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; if (trigger[i] > 0)
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i] = in[i].b + 0.5 -- the read of field 1 of each struct element is
  ; the access the gather check above is looking for.
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load %struct.In*, %struct.In** %in.addr, align 8
  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
  %6 = load float, float* %b, align 4
  %add = fadd float %6, 5.000000e-01
  %7 = load i32, i32* %i, align 4
  %idxprom4 = sext i32 %7 to i64
  %8 = load float*, float** %out.addr, align 8
  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
  store float %add, float* %arrayidx5, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %9 = load i32, i32* %i, align 4
  %inc = add nsw i32 %9, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; The source code
;struct Out {
;  float a;
;  float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i].b = in[i].b + (float) 0.5;
;    }
;  }
;}

;AVX512-LABEL: @foo3
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void

%struct.Out = type { float, float }

define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
entry:
  %in.addr = alloca %struct.In*, align 8
  %out.addr = alloca %struct.Out*, align 8
  %trigger.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store %struct.In* %in, %struct.In** %in.addr, align 8
  store %struct.Out* %out, %struct.Out** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; if (trigger[i] > 0)
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i].b = in[i].b + 0.5 -- both the read and the write touch field 1 of
  ; a two-float struct, matching the gather and scatter checks above.
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load %struct.In*, %struct.In** %in.addr, align 8
  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
  %6 = load float, float* %b, align 4
  %add = fadd float %6, 5.000000e-01
  %7 = load i32, i32* %i, align 4
  %idxprom4 = sext i32 %7 to i64
  %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
  %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
  store float %add, float* %b6, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %9 = load i32, i32* %i, align 4
  %inc = add nsw i32 %9, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)