; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck -check-prefix=SM35 %s

; Checks which load instruction the NVPTX backend selects when a function
; reads through a `noalias readonly` pointer parameter.
;
; For functions annotated as kernels (see !nvvm.annotations at the end of the
; file) the expectations are:
;   * -mcpu=sm_20 : a plain global load,        ld.global.<type>
;   * -mcpu=sm_35 : a non-coherent cached load, ld.global.nc.<type>
; The two functions that are not annotated as kernels must not get the .nc
; form on either target.
;
; Each case copies one value of the type under test from %from to %to; only
; the load is checked.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

;------------------------------------------------------------------------------
; Scalar types
;------------------------------------------------------------------------------

; float
; SM20-LABEL: .visible .entry foo1(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo1(
; SM35: ld.global.nc.f32
define void @foo1(float* noalias readonly %from, float* %to) {
  %val = load float, float* %from
  store float %val, float* %to
  ret void
}

; double
; SM20-LABEL: .visible .entry foo2(
; SM20: ld.global.f64
; SM35-LABEL: .visible .entry foo2(
; SM35: ld.global.nc.f64
define void @foo2(double* noalias readonly %from, double* %to) {
  %val = load double, double* %from
  store double %val, double* %to
  ret void
}

; i16
; SM20-LABEL: .visible .entry foo3(
; SM20: ld.global.u16
; SM35-LABEL: .visible .entry foo3(
; SM35: ld.global.nc.u16
define void @foo3(i16* noalias readonly %from, i16* %to) {
  %val = load i16, i16* %from
  store i16 %val, i16* %to
  ret void
}

; i32
; SM20-LABEL: .visible .entry foo4(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo4(
; SM35: ld.global.nc.u32
define void @foo4(i32* noalias readonly %from, i32* %to) {
  %val = load i32, i32* %from
  store i32 %val, i32* %to
  ret void
}

; i64
; SM20-LABEL: .visible .entry foo5(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo5(
; SM35: ld.global.nc.u64
define void @foo5(i64* noalias readonly %from, i64* %to) {
  %val = load i64, i64* %from
  store i64 %val, i64* %to
  ret void
}

; i128 is not a standard integer type on nvptx64, so two 64-bit loads are
; expected instead of a single wide one.
; SM20-LABEL: .visible .entry foo6(
; SM20: ld.global.u64
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo6(
; SM35: ld.global.nc.u64
; SM35: ld.global.nc.u64
define void @foo6(i128* noalias readonly %from, i128* %to) {
  %val = load i128, i128* %from
  store i128 %val, i128* %to
  ret void
}

;------------------------------------------------------------------------------
; Two-element vectors
;------------------------------------------------------------------------------

; <2 x i8>
; SM20-LABEL: .visible .entry foo7(
; SM20: ld.global.v2.u8
; SM35-LABEL: .visible .entry foo7(
; SM35: ld.global.nc.v2.u8
define void @foo7(<2 x i8>* noalias readonly %from, <2 x i8>* %to) {
  %vec = load <2 x i8>, <2 x i8>* %from
  store <2 x i8> %vec, <2 x i8>* %to
  ret void
}

; <2 x i16>
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.v2.u16
; SM35-LABEL: .visible .entry foo8(
; SM35: ld.global.nc.v2.u16
define void @foo8(<2 x i16>* noalias readonly %from, <2 x i16>* %to) {
  %vec = load <2 x i16>, <2 x i16>* %from
  store <2 x i16> %vec, <2 x i16>* %to
  ret void
}

; <2 x i32>
; SM20-LABEL: .visible .entry foo9(
; SM20: ld.global.v2.u32
; SM35-LABEL: .visible .entry foo9(
; SM35: ld.global.nc.v2.u32
define void @foo9(<2 x i32>* noalias readonly %from, <2 x i32>* %to) {
  %vec = load <2 x i32>, <2 x i32>* %from
  store <2 x i32> %vec, <2 x i32>* %to
  ret void
}

; <2 x i64>
; SM20-LABEL: .visible .entry foo10(
; SM20: ld.global.v2.u64
; SM35-LABEL: .visible .entry foo10(
; SM35: ld.global.nc.v2.u64
define void @foo10(<2 x i64>* noalias readonly %from, <2 x i64>* %to) {
  %vec = load <2 x i64>, <2 x i64>* %from
  store <2 x i64> %vec, <2 x i64>* %to
  ret void
}

; <2 x float>
; SM20-LABEL: .visible .entry foo11(
; SM20: ld.global.v2.f32
; SM35-LABEL: .visible .entry foo11(
; SM35: ld.global.nc.v2.f32
define void @foo11(<2 x float>* noalias readonly %from, <2 x float>* %to) {
  %vec = load <2 x float>, <2 x float>* %from
  store <2 x float> %vec, <2 x float>* %to
  ret void
}

; <2 x double>
; SM20-LABEL: .visible .entry foo12(
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo12(
; SM35: ld.global.nc.v2.f64
define void @foo12(<2 x double>* noalias readonly %from, <2 x double>* %to) {
  %vec = load <2 x double>, <2 x double>* %from
  store <2 x double> %vec, <2 x double>* %to
  ret void
}

;------------------------------------------------------------------------------
; Four-element vectors
;------------------------------------------------------------------------------

; <4 x i8>
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.v4.u8
; SM35-LABEL: .visible .entry foo13(
; SM35: ld.global.nc.v4.u8
define void @foo13(<4 x i8>* noalias readonly %from, <4 x i8>* %to) {
  %vec = load <4 x i8>, <4 x i8>* %from
  store <4 x i8> %vec, <4 x i8>* %to
  ret void
}

; <4 x i16>
; SM20-LABEL: .visible .entry foo14(
; SM20: ld.global.v4.u16
; SM35-LABEL: .visible .entry foo14(
; SM35: ld.global.nc.v4.u16
define void @foo14(<4 x i16>* noalias readonly %from, <4 x i16>* %to) {
  %vec = load <4 x i16>, <4 x i16>* %from
  store <4 x i16> %vec, <4 x i16>* %to
  ret void
}

; <4 x i32>
; SM20-LABEL: .visible .entry foo15(
; SM20: ld.global.v4.u32
; SM35-LABEL: .visible .entry foo15(
; SM35: ld.global.nc.v4.u32
define void @foo15(<4 x i32>* noalias readonly %from, <4 x i32>* %to) {
  %vec = load <4 x i32>, <4 x i32>* %from
  store <4 x i32> %vec, <4 x i32>* %to
  ret void
}

; <4 x float>
; SM20-LABEL: .visible .entry foo16(
; SM20: ld.global.v4.f32
; SM35-LABEL: .visible .entry foo16(
; SM35: ld.global.nc.v4.f32
define void @foo16(<4 x float>* noalias readonly %from, <4 x float>* %to) {
  %vec = load <4 x float>, <4 x float>* %from
  store <4 x float> %vec, <4 x float>* %to
  ret void
}

; <4 x double>: two v2.f64 loads are expected rather than one v4 load.
; SM20-LABEL: .visible .entry foo17(
; SM20: ld.global.v2.f64
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo17(
; SM35: ld.global.nc.v2.f64
; SM35: ld.global.nc.v2.f64
define void @foo17(<4 x double>* noalias readonly %from, <4 x double>* %to) {
  %vec = load <4 x double>, <4 x double>* %from
  store <4 x double> %vec, <4 x double>* %to
  ret void
}

;------------------------------------------------------------------------------
; Pointer-typed value and loop-carried address
;------------------------------------------------------------------------------

; The loaded value is itself a pointer; with 64-bit pointers a u64 load is
; expected.
; SM20-LABEL: .visible .entry foo18(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo18(
; SM35: ld.global.nc.u64
define void @foo18(float** noalias readonly %from, float** %to) {
  %ptrval = load float*, float** %from
  store float* %ptrval, float** %to
  ret void
}

; Test that a cached load is still inferred when the load address is derived
; from %from through a loop induction variable rather than being %from itself.
; SM20-LABEL: .visible .entry foo19(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo19(
; SM35: ld.global.nc.f32
define void @foo19(float* noalias readonly %from, float* %to, i32 %n) {
entry:
  br label %loop

loop:
  ; Running element index and running sum of the elements read so far.
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %acc = phi float [ 0.0, %entry ], [ %acc.next, %loop ]
  %elt.ptr = getelementptr inbounds float, float* %from, i32 %idx
  %elt = load float, float* %elt.ptr, align 4
  %acc.next = fadd float %elt, %acc
  %idx.next = add nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %n
  br i1 %done, label %exit, label %loop

exit:
  store float %acc.next, float* %to
  ret void
}

;------------------------------------------------------------------------------
; Non-kernel functions (not listed in !nvvm.annotations)
;------------------------------------------------------------------------------

; This test captures the case of a non-kernel function. In a non-kernel
; function, without interprocedural analysis, we do not know that the
; parameter is global. We also do not know that the pointed-to memory is never
; written to (for the duration of the kernel). For both reasons, we cannot use
; a cached load here.
; SM20-LABEL: notkernel(
; SM20: ld.f32
; SM35-LABEL: notkernel(
; SM35: ld.f32
define void @notkernel(float* noalias readonly %from, float* %to) {
  %val = load float, float* %from
  store float %val, float* %to
  ret void
}

; As @notkernel, but with the parameter explicitly marked as global via
; addrspace(1). We still do not know that the pointed-to memory is never
; written to (for the duration of the kernel), so the load stays a plain
; global load on both targets. This case does not currently come up normally,
; since pointers are not inferred to be global interprocedurally as of
; 2015-08-05.
; SM20-LABEL: notkernel2(
; SM20: ld.global.f32
; SM35-LABEL: notkernel2(
; SM35: ld.global.f32
define void @notkernel2(float addrspace(1)* noalias readonly %from, float* %to) {
  %val = load float, float addrspace(1)* %from
  store float %val, float* %to
  ret void
}

;------------------------------------------------------------------------------
; Kernel annotations: @foo1 .. @foo19 are kernels; @notkernel and @notkernel2
; are deliberately absent from this list.
;------------------------------------------------------------------------------

!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
!1 = !{void (float*, float*)* @foo1, !"kernel", i32 1}
!2 = !{void (double*, double*)* @foo2, !"kernel", i32 1}
!3 = !{void (i16*, i16*)* @foo3, !"kernel", i32 1}
!4 = !{void (i32*, i32*)* @foo4, !"kernel", i32 1}
!5 = !{void (i64*, i64*)* @foo5, !"kernel", i32 1}
!6 = !{void (i128*, i128*)* @foo6, !"kernel", i32 1}
!7 = !{void (<2 x i8>*, <2 x i8>*)* @foo7, !"kernel", i32 1}
!8 = !{void (<2 x i16>*, <2 x i16>*)* @foo8, !"kernel", i32 1}
!9 = !{void (<2 x i32>*, <2 x i32>*)* @foo9, !"kernel", i32 1}
!10 = !{void (<2 x i64>*, <2 x i64>*)* @foo10, !"kernel", i32 1}
!11 = !{void (<2 x float>*, <2 x float>*)* @foo11, !"kernel", i32 1}
!12 = !{void (<2 x double>*, <2 x double>*)* @foo12, !"kernel", i32 1}
!13 = !{void (<4 x i8>*, <4 x i8>*)* @foo13, !"kernel", i32 1}
!14 = !{void (<4 x i16>*, <4 x i16>*)* @foo14, !"kernel", i32 1}
!15 = !{void (<4 x i32>*, <4 x i32>*)* @foo15, !"kernel", i32 1}
!16 = !{void (<4 x float>*, <4 x float>*)* @foo16, !"kernel", i32 1}
!17 = !{void (<4 x double>*, <4 x double>*)* @foo17, !"kernel", i32 1}
!18 = !{void (float**, float**)* @foo18, !"kernel", i32 1}
!19 = !{void (float*, float*, i32)* @foo19, !"kernel", i32 1}