; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
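; Tests for elimination of llvm.canonicalize.* calls: when the operand is
; already produced by an instruction that canonicalizes its result (FP
; arithmetic, conversions, etc.), the canonicalize is expected to fold away;
; when the operand may be non-canonical (e.g. a raw loaded value), a
; canonicalizing instruction (v_mul_f32 by 1.0 or v_max_f32 x, x) is expected
; instead, as the checks below spell out per subtarget and denormal mode.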

; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN-FLUSH:   v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fmul float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float 15.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fadd float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.sqrt.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.ceil.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.floor.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
; GCN-FLUSH:  v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.canonicalize.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fpext float %load to double
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = fpext half %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v = fptrunc double %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fptrunc float %load to half
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %v = fptrunc <2 x float> %load to <2 x half>
  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN-FLUSH:  v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float -0.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = fsub float -0.0, %v0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN-FLUSH:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.fabs.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.fabs.f32(float %v0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.sin.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.cos.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.sin.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.cos.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

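; Canonicalizing a quiet NaN constant is expected to fold to an immediate move
; of the canonical qNaN bit pattern (0x7fc00000), with no extra canonicalizing
; arithmetic.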
; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.

; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; VI:   v_add_u32_e32 v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}
; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; VI:    v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GCN:   {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
; VI:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v0 = fadd double %load, 0.0
  %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  store double %canonicalized, double addrspace(1)* %gep, align 8
  ret void
}

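; The next two functions use the amdgpu_ps calling convention, i.e. non-IEEE
; mode. Per the checks below, the canonicalize is only expected to survive as
; a v_mul by 1.0 when FP exceptions are enabled (GCN-EXCEPT); with the nnan
; flag in the second test it is expected to fold away entirely.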
; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NEXT: ; return
; GCN-NOT: 1.0
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul nnan float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]]
; GFX9-DENORM-NOT: 1.0
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %v = load double, double addrspace(1)* %gep, align 8
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %v = load half, half addrspace(1)* %gep, align 2
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; Avoid failing the test on FreeBSD 11.0, where the GCN-NOT: 1.0 check would otherwise
; match the "1.0" in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive.
; CHECK: .amd_amdgpu_isa

declare float @llvm.canonicalize.f32(float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.ceil.f32(float) #0
declare float @llvm.floor.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.sin.f32(float) #0
declare float @llvm.cos.f32(float) #0
declare half @llvm.sin.f16(half) #0
declare half @llvm.cos.f16(half) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0

attributes #0 = { nounwind readnone }
attributes #1 = { "no-nans-fp-math"="true" }