; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Checks when a floating-point multiply by 0.5, 2.0 or 4.0 is folded into the
; output modifier (div:2, mul:2, mul:4) of the instruction producing its input,
; and when it must stay a separate instruction.

; IEEE bit enabled for compute kernel, so omod shouldn't be used.
; Expect a separate add and multiply.
; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so omod shouldn't be used even though
; nsz is allowed.
; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; Without the IEEE bit, omod is only allowed if signed zeros are insignificant.
; Signed zeros are significant here, so omod must not be used.
; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Basic fold: the multiply by 0.5 becomes div:2 on the add.
; GCN-LABEL: {{^}}v_omod_div2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Basic fold: the multiply by 2.0 becomes mul:2 on the add.
; GCN-LABEL: {{^}}v_omod_mul2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul2 = fmul float %add, 2.0
  store float %mul2, float addrspace(1)* undef
  ret void
}

; Basic fold: the multiply by 4.0 becomes mul:4 on the add.
; GCN-LABEL: {{^}}v_omod_mul4_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  ret void
}

; The unscaled add result is also stored (second use), so the multiply is
; expected to stay a separate instruction.
; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; A llvm.dbg.value use of the add must not count as an extra use: the fold is
; still expected.
; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  %mul4 = fmul float %add, 4.0
  store float %mul4, float addrspace(1)* undef
  ret void
}

; Clamp is applied after omod, so folding both into one instruction is OK.
; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5

  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; Cannot fold omod into clamp: here the multiply comes after the clamp, so it
; stays a separate instruction.
; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %div2 = fmul float %clamp, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; The multiply reads |add| rather than add, so it is not folded; the fabs is
; expected as a source modifier on the multiply.
; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %abs.add = call float @llvm.fabs.f32(float %add)
  %div2 = fmul float %abs.add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; a + a followed by a clamp: expect one add carrying clamp and no omod.
; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
  %add = fadd float %a, %a
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; clamp(a) + clamp(a): the clamp is expected as its own v_max_f32, and the
; self-add must stay a plain add rather than becoming an omod on the clamp.
; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %add = fadd float %clamp, %clamp
  store float %add, float addrspace(1)* undef
  ret void
}

; |x| + |x|: the operands carry source modifiers, so the second add is
; expected to remain an add and not be folded as mul:2.
; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

; |x| + x: the two operands differ by a source modifier, so no fold is expected.
; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:
; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %x
  store float %add, float addrspace(1)* undef
  ret void
}

; x + |x|: same as above with the operands swapped.
; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

; Don't fold an omod into an instruction that already carries an omod.
; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
  %add = fadd float %a, 1.0
  %div2.0 = fmul float %add, 0.5
  %div2.1 = fmul float %div2.0, 0.5
  store float %div2.1, float addrspace(1)* undef
  ret void
}

; Don't fold omod if f32 denormals are enabled.
; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if f32 denormals are enabled, for the add form (x + x
; instead of x * 2.0).
; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
  %add = fadd float %a, 1.0
  %mul2 = fadd float %add, %add
  store float %mul2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if f16 denormals are enabled. Attribute group #0 does not
; turn them off, so presumably they are on by default for this target.
; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; Don't fold omod if f16 denormals are enabled, for the add form (x + x).
; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
  %add = fadd half %a, 1.0
  %mul2 = fadd half %add, %add
  store half %mul2, half addrspace(1)* undef
  ret void
}

; With f16 denormals disabled (attribute group #3) the fold is allowed.
; The negative check requires that v0 is not mentioned before the folded add.
; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
; VI-NOT: v0
; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; The multiply by 2.0 of a multiply-add result is expected to appear as mul:2
; on a v_mad_f32. The test name suggests a v_mac_f32 would otherwise have been
; selected; that cannot be confirmed from the checks alone.
; GCN-LABEL: {{^}}v_omod_mac_to_mad:
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} mul:2{{$}}
define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
  %mul = fmul float %a, %a
  %add = fadd float %mul, %b
  %mad = fmul float %add, 2.0
  %res = fmul float %mad, %b
  store float %res, float addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

; #0: signed zeros may be ignored.
; #1: used on the intrinsic declarations.
; #2: like #0, with f32 denormals enabled.
; #3: like #0, with f64/f16 denormals disabled.
; #4: signed zeros must be honored.
attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" }
attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }

; Minimal debug info backing the llvm.dbg.value call in v_omod_mul4_dbg_use_f32.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIBasicType(name: "float", size: 32, align: 32)
!9 = !DIExpression()
!10 = !DILocation(line: 1, column: 42, scope: !5)