Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
      3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
      4 
      5 ; GCN-LABEL: {{^}}v_clamp_f32:
      6 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
      7 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
      8 define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; Baseline f32 case: a loaded value bounded below by 0.0 (maxnum) and
          ; above by 1.0 (minnum). The check lines for this label expect one
          ; v_max_f32_e64 of the loaded register with itself plus the clamp modifier.
      9   %tid = call i32 @llvm.amdgcn.workitem.id.x()
          ; Input and output elements are both indexed by the workitem id.
     10   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
     11   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
     12   %a = load float, float addrspace(1)* %gep0
          ; Lower bound first, then upper bound.
     13   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
     14   %med = call float @llvm.minnum.f32(float %max, float 1.0)
     15 
     16   store float %med, float addrspace(1)* %out.gep
     17   ret void
     18 }
     19 
     20 ; GCN-LABEL: {{^}}v_clamp_neg_f32:
     21 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
     22 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
     23 define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; Same max(.., 0.0) / min(.., 1.0) sequence as the baseline, applied to the
          ; negated load. The check lines expect the negation to appear as a source
          ; modifier on both operands of the clamped v_max_f32_e64.
     24   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     25   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
     26   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
     27   %a = load float, float addrspace(1)* %gep0
          ; Negation is written as a subtraction from -0.0.
     28   %fneg.a = fsub float -0.0, %a
     29   %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
     30   %med = call float @llvm.minnum.f32(float %max, float 1.0)
     31 
     32   store float %med, float addrspace(1)* %out.gep
     33   ret void
     34 }
     35 
     36 ; GCN-LABEL: {{^}}v_clamp_negabs_f32:
     37 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
     38 ; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
     39 define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; Clamp sequence applied to -|a|. The check lines expect both the abs and
          ; the neg to appear as source modifiers on the clamped v_max_f32_e64.
     40   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     41   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
     42   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
     43   %a = load float, float addrspace(1)* %gep0
          ; Absolute value, then negation written as a subtraction from -0.0.
     44   %fabs.a = call float @llvm.fabs.f32(float %a)
     45   %fneg.fabs.a = fsub float -0.0, %fabs.a
     46 
          ; Lower bound 0.0, then upper bound 1.0.
     47   %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
     48   %med = call float @llvm.minnum.f32(float %max, float 1.0)
     49 
     50   store float %med, float addrspace(1)* %out.gep
     51   ret void
     52 }
     53 
     54 ; GCN-LABEL: {{^}}v_clamp_negzero_f32:
     55 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
     56 ; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
     57 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
     58 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; Variant whose lower bound is -0.0 instead of +0.0. The check lines expect
          ; a v_med3_f32 against a register produced by v_bfrev_b32 of 1 and the
          ; constant 1.0, i.e. no clamp modifier is expected for this bound.
     59   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     60   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
     61   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
     62   %a = load float, float addrspace(1)* %gep0
          ; Note the negative zero operand here.
     63   %max = call float @llvm.maxnum.f32(float %a, float -0.0)
     64   %med = call float @llvm.minnum.f32(float %max, float 1.0)
     65 
     66   store float %med, float addrspace(1)* %out.gep
     67   ret void
     68 }
     69 
     70 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
     71 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
     72 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
     73 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
     74 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; The intermediate maxnum result has a second use (a volatile store), so
          ; the check lines expect separate v_max_f32_e32 and v_min_f32_e32
          ; instructions rather than one clamped instruction.
     75   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     76   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
     77   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
     78   %a = load float, float addrspace(1)* %gep0
     79   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
     80   %med = call float @llvm.minnum.f32(float %max, float 1.0)
     81 
     82   store float %med, float addrspace(1)* %out.gep
          ; Extra use of %max; volatile so the store is not removed.
     83   store volatile float %max, float addrspace(1)* undef
     84   ret void
     85 }
     86 
     87 ; GCN-LABEL: {{^}}v_clamp_f16:
     88 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
     89 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
     90 
     91 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
     92 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
     93 define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
          ; Half-precision version of the max(.., 0.0) / min(.., 1.0) sequence.
          ; Per the check lines, the fiji and gfx900 runs expect a clamped
          ; v_max_f16_e64, while the run without -mcpu expects the clamp modifier on
          ; the f16-to-f32 conversion followed by a conversion back to f16.
     94   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     95   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
     96   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
     97   %a = load half, half addrspace(1)* %gep0
     98   %max = call half @llvm.maxnum.f16(half %a, half 0.0)
     99   %med = call half @llvm.minnum.f16(half %max, half 1.0)
    100 
    101   store half %med, half addrspace(1)* %out.gep
    102   ret void
    103 }
    104 
    105 ; GCN-LABEL: {{^}}v_clamp_neg_f16:
    106 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
    107 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
    108 
    109 ; FIXME: Better to fold neg into max
    110 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
    111 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
    112 define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
          ; Half-precision clamp sequence on a negated load. The check lines expect
          ; the negation as a source modifier: on the clamped v_max_f16_e64 for the
          ; fiji and gfx900 runs, and on the clamped f16-to-f32 conversion for the
          ; run without -mcpu.
    113   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    114   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
    115   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
    116   %a = load half, half addrspace(1)* %gep0
          ; Negation is written as a subtraction from -0.0.
    117   %fneg.a = fsub half -0.0, %a
    118   %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
    119   %med = call half @llvm.minnum.f16(half %max, half 1.0)
    120 
    121   store half %med, half addrspace(1)* %out.gep
    122   ret void
    123 }
    124 
    125 ; GCN-LABEL: {{^}}v_clamp_negabs_f16:
    126 ; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
    127 ; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
    128 
    129 ; FIXME: Better to fold neg/abs into max
    130 
    131 ; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
    132 ; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
    133 define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
          ; Half-precision clamp sequence on -|a|. The check lines expect the neg and
          ; abs as source modifiers: on the clamped v_max_f16_e64 for the fiji and
          ; gfx900 runs, and on the clamped f16-to-f32 conversion otherwise.
    134   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    135   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
    136   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
    137   %a = load half, half addrspace(1)* %gep0
          ; Absolute value, then negation written as a subtraction from -0.0.
    138   %fabs.a = call half @llvm.fabs.f16(half %a)
    139   %fneg.fabs.a = fsub half -0.0, %fabs.a
    140 
    141   %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
    142   %med = call half @llvm.minnum.f16(half %max, half 1.0)
    143 
    144   store half %med, half addrspace(1)* %out.gep
    145   ret void
    146 }
    147 
    148 ; FIXME: Do f64 instructions support clamp?
    149 ; GCN-LABEL: {{^}}v_clamp_f64:
    150 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
    151 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
    152 define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
          ; Double-precision version of the max(.., 0.0) / min(.., 1.0) sequence.
          ; The check lines expect a two-dword load and a clamped v_max_f64 of the
          ; loaded register pair with itself.
    153   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    154   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
    155   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
    156   %a = load double, double addrspace(1)* %gep0
    157   %max = call double @llvm.maxnum.f64(double %a, double 0.0)
    158   %med = call double @llvm.minnum.f64(double %max, double 1.0)
    159 
    160   store double %med, double addrspace(1)* %out.gep
    161   ret void
    162 }
    163 
    164 ; GCN-LABEL: {{^}}v_clamp_neg_f64:
    165 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
    166 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
    167 define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
          ; Double-precision clamp sequence on a negated load. The check lines expect
          ; the negation as a source modifier on both operands of the clamped
          ; v_max_f64.
    168   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    169   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
    170   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
    171   %a = load double, double addrspace(1)* %gep0
          ; Negation is written as a subtraction from -0.0.
    172   %fneg.a = fsub double -0.0, %a
    173   %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
    174   %med = call double @llvm.minnum.f64(double %max, double 1.0)
    175 
    176   store double %med, double addrspace(1)* %out.gep
    177   ret void
    178 }
    179 
    180 ; GCN-LABEL: {{^}}v_clamp_negabs_f64:
    181 ; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
    182 ; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
    183 define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
          ; Double-precision clamp sequence on -|a|. The check lines expect the neg
          ; and abs as source modifiers on both operands of the clamped v_max_f64.
    184   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    185   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
    186   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
    187   %a = load double, double addrspace(1)* %gep0
          ; Absolute value, then negation written as a subtraction from -0.0.
    188   %fabs.a = call double @llvm.fabs.f64(double %a)
    189   %fneg.fabs.a = fsub double -0.0, %fabs.a
    190 
    191   %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
    192   %med = call double @llvm.minnum.f64(double %max, double 1.0)
    193 
    194   store double %med, double addrspace(1)* %out.gep
    195   ret void
    196 }
    197 
    198 ; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
    199 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    200 ; GCN: v_med3_f32
    201 define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 intrinsic with bounds -0.0 and 1.0 in the first two operands and
          ; the loaded value last. The check lines only require that a v_med3_f32
          ; is still emitted for this negative-zero bound.
    202   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    203   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    204   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    205   %a = load float, float addrspace(1)* %gep0
    206   %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
    207   store float %med, float addrspace(1)* %out.gep
    208   ret void
    209 }
    210 
    211 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
    212 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    213 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    214 define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (0.0, 1.0, loaded value). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    215   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    216   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    217   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    218   %a = load float, float addrspace(1)* %gep0
    219   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
    220   store float %med, float addrspace(1)* %out.gep
    221   ret void
    222 }
    223 
    224 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
    225 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    226 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    227 define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (1.0, 0.0, loaded value). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    228   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    229   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    230   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    231   %a = load float, float addrspace(1)* %gep0
    232   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
    233   store float %med, float addrspace(1)* %out.gep
    234   ret void
    235 }
    236 
    237 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
    238 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    239 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    240 define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (loaded value, 0.0, 1.0). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    241   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    242   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    243   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    244   %a = load float, float addrspace(1)* %gep0
    245   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
    246   store float %med, float addrspace(1)* %out.gep
    247   ret void
    248 }
    249 
    250 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
    251 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    252 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    253 define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (loaded value, 1.0, 0.0). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    254   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    255   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    256   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    257   %a = load float, float addrspace(1)* %gep0
    258   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
    259   store float %med, float addrspace(1)* %out.gep
    260   ret void
    261 }
    262 
    263 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
    264 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    265 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    266 define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (0.0, loaded value, 1.0). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    267   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    268   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    269   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    270   %a = load float, float addrspace(1)* %gep0
    271   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
    272   store float %med, float addrspace(1)* %out.gep
    273   ret void
    274 }
    275 
    276 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
    277 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    278 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    279 define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
          ; fmed3 operand order (1.0, loaded value, 0.0). The check lines expect the
          ; median to become a clamped v_max_f32_e64 of the loaded register.
    280   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    281   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    282   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    283   %a = load float, float addrspace(1)* %gep0
    284   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
    285   store float %med, float addrspace(1)* %out.gep
    286   ret void
    287 }
    288 
    289 ; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
    290 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
    291 define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
          ; All three fmed3 operands are constants (0.0, 1.0, 4.0); no input is
          ; loaded. The check lines expect the stored value to be materialized
          ; directly as 1.0 by a v_mov_b32.
    292   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    293   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    294   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
    295   store float %med, float addrspace(1)* %out.gep
    296   ret void
    297 }
    298 
    299 ; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
    300 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
    301 define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
          ; All three fmed3 operands are constants (0.0, 1.0, -4.0); no input is
          ; loaded. The check lines expect the stored value to be materialized
          ; directly as 0 by a v_mov_b32.
    302   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    303   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    304   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
    305   store float %med, float addrspace(1)* %out.gep
    306   ret void
    307 }
    308 
    309 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
    310 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
    311 define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
          ; Constant fmed3 whose third operand (0.5) already lies between the 0.0
          ; and 1.0 bounds. The check lines expect 0.5 to be materialized unchanged
          ; by a v_mov_b32.
    312   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    313   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    314   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
    315   store float %med, float addrspace(1)* %out.gep
    316   ret void
    317 }
    318 
    319 ; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
    320 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
    321 define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
          ; Constant fmed3 whose third operand is the bit pattern 8388607
          ; (0x7fffff) reinterpreted as a float. The check lines expect that same
          ; bit pattern to be materialized unchanged by a v_mov_b32.
    322   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    323   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    324   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
    325   store float %med, float addrspace(1)* %out.gep
    326   ret void
    327 }
    328 
    329 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
    330 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
    331 define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
          ; Constant fmed3 whose third operand is the NaN constant
          ; 0x7FF8000000000000 (a quiet NaN, per the function name). The check lines
          ; expect the stored value to be materialized as 0 by a v_mov_b32.
    332   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    333   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    334   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
    335   store float %med, float addrspace(1)* %out.gep
    336   ret void
    337 }
    338 
    339 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
    340 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
    341 define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
          ; Constant fmed3 whose third operand is the bit pattern 2139095041
          ; (0x7f800001) reinterpreted as a float (a signaling NaN, per the function
          ; name). The check lines expect the stored value to be materialized as 0.
    342   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    343   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    344   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
    345   store float %med, float addrspace(1)* %out.gep
    346   ret void
    347 }
    348 
    349 ; ---------------------------------------------------------------------
    350 ; Test non-default behaviors enabling snans and disabling dx10_clamp
    351 ; ---------------------------------------------------------------------
    352 
    353 ; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
    354 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    355 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
    356 define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; Same IR as the baseline f32 max/min sequence, but under attribute group
          ; #2. The check lines expect a v_med3_f32 with 0 and 1.0 instead of a
          ; clamped max.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm against the attribute list.
    357   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    358   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    359   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    360   %a = load float, float addrspace(1)* %gep0
    361   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
    362   %med = call float @llvm.minnum.f32(float %max, float 1.0)
    363 
    364   store float %med, float addrspace(1)* %out.gep
    365   ret void
    366 }
    367 
    368 ; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
    369 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    370 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    371 define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
          ; Same IR as the baseline f32 max/min sequence, but under attribute group
          ; #3. The check lines still expect a clamped v_max_f32_e64.
          ; NOTE(review): #3 is defined outside this excerpt; the function name
          ; suggests signaling-NaN handling on with dx10_clamp on - confirm.
    372   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    373   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    374   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    375   %a = load float, float addrspace(1)* %gep0
    376   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
    377   %med = call float @llvm.minnum.f32(float %max, float 1.0)
    378 
    379   store float %med, float addrspace(1)* %out.gep
    380   ret void
    381 }
    382 
    383 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
    384 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    385 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
    386 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
    387 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
          ; Same IR as the baseline f32 max/min sequence, but under attribute group
          ; #4. The check lines expect separate v_max_f32_e32 and v_min_f32_e32
          ; instructions, i.e. neither a clamp modifier nor a med3.
          ; NOTE(review): #4 is defined outside this excerpt; the function name
          ; suggests signaling-NaN handling on with dx10_clamp off - confirm.
    388   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    389   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    390   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    391   %a = load float, float addrspace(1)* %gep0
    392   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
    393   %med = call float @llvm.minnum.f32(float %max, float 1.0)
    394 
    395   store float %med, float addrspace(1)* %out.gep
    396   ret void
    397 }
    398 
    399 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
    400 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    401 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
    402 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
    403 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
          ; Variant under attribute group #4 where the max/min input comes from an
          ; fadd carrying the nnan flag. The check lines expect a v_add_f32 followed
          ; by a v_med3_f32 of the sum with 0 and 1.0.
          ; NOTE(review): #4 is defined outside this excerpt; the function name
          ; suggests signaling-NaN handling on with dx10_clamp off - confirm.
    404   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    405   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    406   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    407   %a = load float, float addrspace(1)* %gep0
          ; The nnan fast-math flag marks this sum as never NaN.
    408   %add  = fadd nnan float %a, 1.0
    409   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
    410   %med = call float @llvm.minnum.f32(float %max, float 1.0)
    411 
    412   store float %med, float addrspace(1)* %out.gep
    413   ret void
    414 }
    415 
    416 ; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
    417 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    418 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    419 define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (0.0, 1.0, loaded value) under attribute group #2.
          ; The check lines expect a clamped v_max_f32_e64 for this operand order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    420   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    421   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    422   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    423   %a = load float, float addrspace(1)* %gep0
    424   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
    425   store float %med, float addrspace(1)* %out.gep
    426   ret void
    427 }
    428 
    429 ; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
    430 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    431 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
    432 define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (1.0, 0.0, loaded value) under attribute group #2.
          ; The check lines expect a clamped v_max_f32_e64 for this operand order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    433   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    434   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    435   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    436   %a = load float, float addrspace(1)* %gep0
    437   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
    438   store float %med, float addrspace(1)* %out.gep
    439   ret void
    440 }
    441 
    442 ; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
    443 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    444 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
    445 define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (loaded value, 0.0, 1.0) under attribute group #2.
          ; The check lines expect a v_med3_f32 with operands in that same order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    446   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    447   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    448   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    449   %a = load float, float addrspace(1)* %gep0
    450   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
    451   store float %med, float addrspace(1)* %out.gep
    452   ret void
    453 }
    454 
    455 ; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
    456 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    457 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
    458 define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (loaded value, 1.0, 0.0) under attribute group #2.
          ; The check lines expect a v_med3_f32 with operands in that same order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    459   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    460   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    461   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    462   %a = load float, float addrspace(1)* %gep0
    463   %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
    464   store float %med, float addrspace(1)* %out.gep
    465   ret void
    466 }
    467 
    468 ; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
    469 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    470 ; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
    471 define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (0.0, loaded value, 1.0) under attribute group #2.
          ; The check lines expect a v_med3_f32 with operands in that same order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    472   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    473   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    474   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    475   %a = load float, float addrspace(1)* %gep0
    476   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
    477   store float %med, float addrspace(1)* %out.gep
    478   ret void
    479 }
    480 
    481 ; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
    482 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    483 ; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
    484 define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
          ; fmed3 operand order (1.0, loaded value, 0.0) under attribute group #2.
          ; The check lines expect a v_med3_f32 with operands in that same order.
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    485   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    486   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
    487   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    488   %a = load float, float addrspace(1)* %gep0
    489   %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
    490   store float %med, float addrspace(1)* %out.gep
    491   ret void
    492 }
    493 
    494 ; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
    495 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
    496 define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
          ; Constant fmed3 with the NaN constant 0x7FF8000000000000 as third operand,
          ; under attribute group #2. The check lines expect the value 0x7fc00000 to
          ; be materialized by a v_mov_b32 (rather than 0).
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    497   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    498   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    499   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
    500   store float %med, float addrspace(1)* %out.gep
    501   ret void
    502 }
    503 
    504 ; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
    505 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
    506 define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
          ; Constant fmed3 with the bit pattern 2139095041 (0x7f800001) as third
          ; operand, under attribute group #2. The check lines expect that same bit
          ; pattern to be materialized by a v_mov_b32 (rather than 0).
          ; NOTE(review): #2 is defined outside this excerpt; the function name
          ; suggests it turns dx10_clamp off - confirm.
    507   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    508   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
    509   %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
    510   store float %med, float addrspace(1)* %out.gep
    511   ret void
    512 }
    513 
    514 ; GCN-LABEL: {{^}}v_clamp_v2f16:
    515 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    516 ; GFX9-NOT: [[A]]
    517 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
    518 define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Two-element half vector version of the max(.., 0.0) / min(.., 1.0)
          ; sequence. Beyond the load, only the gfx900 run has check lines; they
          ; expect a single clamped v_pk_max_f16 of the loaded register with itself.
    519   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    520   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    521   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    522   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Both lanes use 0.0 as lower bound and 1.0 as upper bound.
    523   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
    524   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    525 
    526   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    527   ret void
    528 }
    529 
    530 ; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
    531 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    532 ; GFX9-NOT: [[A]]
    533 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
    534 define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Vector variant where each bound vector has one undef lane. The gfx900
          ; check lines still expect a single clamped v_pk_max_f16.
    535   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    536   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    537   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    538   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Lower bound has lane 0 undef; upper bound has lane 1 undef.
    539   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
    540   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
    541 
    542   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    543   ret void
    544 }
    545 
    546 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
    547 ; GFX9: v_pk_max_f16
    548 ; GFX9: v_pk_min_f16
    549 define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Negative test: lane 0 of the lower bound is 2.0 rather than 0.0. The
          ; gfx900 check lines expect separate v_pk_max_f16 and v_pk_min_f16
          ; instructions.
    550   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    551   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    552   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    553   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
    554   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
    555   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    556 
    557   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    558   ret void
    559 }
    560 
    561 ; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
    562 ; GFX9: v_pk_max_f16
    563 ; GFX9: v_pk_min_f16
    564 define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Negative test: lane 0 of the upper bound is 0.0 rather than 1.0. The
          ; gfx900 check lines expect separate v_pk_max_f16 and v_pk_min_f16
          ; instructions.
    565   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    566   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    567   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    568   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
    569   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
    570   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
    571 
    572   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    573   ret void
    574 }
    575 
    576 ; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
    577 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    578 ; GFX9-NOT: [[A]]
    579 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
    580 define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Vector clamp sequence on a negated load. The gfx900 check lines expect a
          ; clamped v_pk_max_f16 with neg_lo and neg_hi set for both operands.
    581   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    582   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    583   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    584   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Per-lane negation written as a subtraction from a vector of -0.0.
    585   %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
    586   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
    587   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    588 
    589   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    590   ret void
    591 }
    592 
    593 ; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
        ; Clamp of -|a| for a <2 x half> value. The GFX9 checks expect the fabs to
        ; be a separate v_and_b32 with mask 0x7fff7fff (bit 15 of each 16-bit half
        ; cleared), and the negation plus clamp to be one v_pk_max_f16 that reads
        ; the masked value with neg_lo/neg_hi modifiers and the clamp bit.
    594 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    595 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
    596 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
    597 define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Each work item uses its x id as the element index into %aptr and %out.
    598   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    599   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    600   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    601   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; -|a| for both elements: fabs, then negation written as (-0.0 - x).
    602   %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
    603   %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a
    604 
          ; Clamp to [0.0, 1.0]: maxnum with zero, then minnum with one.
    605   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
    606   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    607 
    608   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    609   ret void
    610 }
    611 
    612 ; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
        ; Clamp where only element 0 of the vector is negated. The GFX9 checks
        ; expect a single v_pk_max_f16 with the clamp bit and only the neg_lo
        ; modifier, reading the loaded register directly. The negative GFX9 check
        ; requires that the loaded register is not mentioned before it.
    613 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    614 ; GFX9-NOT: [[A]]
    615 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
    616 define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Each work item uses its x id as the element index into %aptr and %out.
    617   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    618   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    619   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    620   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Replace element 0 with its negation (-0.0 - lo); element 1 is unchanged.
    621   %lo = extractelement <2 x half> %a, i32 0
    622   %neg.lo = fsub half -0.0, %lo
    623   %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
          ; Clamp to [0.0, 1.0]: maxnum with zero, then minnum with one.
    624   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
    625   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    626 
    627   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    628   ret void
    629 }
    630 
    631 ; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
        ; Clamp where only element 1 of the vector is negated. The GFX9 checks
        ; expect a single v_pk_max_f16 with the clamp bit and only the neg_hi
        ; modifier, reading the loaded register directly. The negative GFX9 check
        ; requires that the loaded register is not mentioned before it.
    632 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    633 ; GFX9-NOT: [[A]]
    634 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
    635 define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Each work item uses its x id as the element index into %aptr and %out.
    636   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    637   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    638   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    639   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Replace element 1 with its negation (-0.0 - hi); element 0 is unchanged.
    640   %hi = extractelement <2 x half> %a, i32 1
    641   %neg.hi = fsub half -0.0, %hi
    642   %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
          ; Clamp to [0.0, 1.0]: maxnum with zero, then minnum with one.
    643   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
    644   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    645 
    646   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    647   ret void
    648 }
    649 
    650 ; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
        ; Clamp of a vector whose two elements have been swapped. The GFX9 checks
        ; expect a single v_pk_max_f16 with the clamp bit that reads the loaded
        ; register directly with op_sel:[1,1] op_sel_hi:[0,0]. The negative GFX9
        ; check requires that the loaded register is not mentioned before it.
    651 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
    652 ; GFX9-NOT: [[A]]
    653 ; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
    654 define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
          ; Each work item uses its x id as the element index into %aptr and %out.
    655   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    656   %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
    657   %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
    658   %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
          ; Mask <1, 0> swaps the two elements of %a.
    659   %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
          ; Clamp to [0.0, 1.0]: maxnum with zero, then minnum with one.
    660   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
    661   %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
    662 
    663   store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
    664   ret void
    665 }
    666 
    667 ; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
        ; Clamp applied to the maxnum of two different values. The checks, which
        ; apply to every target in the run lines, expect the two sums to come from
        ; v_add_f32 and then a single v_max_f32 with the clamp bit whose two source
        ; operands are those two different registers.
    668 ; GCN: v_add_f32_e32 [[A:v[0-9]+]]
    669 ; GCN: v_add_f32_e32 [[B:v[0-9]+]]
    670 ; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
    671 define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
    672 {
          ; This kernel does not use the work-item id: it reads elements 0, 1 and 2
          ; of %aptr and writes element 3 of %out.
    673   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
    674   %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
    675   %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
    676   %l0 = load float, float addrspace(1)* %gep0
    677   %l1 = load float, float addrspace(1)* %gep1
    678   %l2 = load float, float addrspace(1)* %gep2
          ; Two distinct sums sharing %l0; every floating-point operation from here
          ; on carries the nsz fast-math flag.
    679   %a = fadd nsz float %l0, %l1
    680   %b = fadd nsz float %l0, %l2
          ; max(a, b) followed by a [0.0, 1.0] clamp (maxnum with 0.0, minnum with 1.0).
    681   %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
    682   %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
    683   %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
    684   %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
    685   store float %min, float addrspace(1)* %out.gep
    686   ret void
    687 }
    688 
    689 declare i32 @llvm.amdgcn.workitem.id.x() #1
        ; Intrinsic declarations for the whole test file; all of them use attribute
        ; group #1. The work-item id above supplies the per-work-item element index
        ; in the kernels that call it.
        ;
        ; f32 intrinsics. minnum/maxnum form the clamp patterns; fabs and fmed3 are
        ; not called in the final part of the file (presumably used by earlier
        ; tests - confirm before removing).
    690 declare float @llvm.fabs.f32(float) #1
    691 declare float @llvm.minnum.f32(float, float) #1
    692 declare float @llvm.maxnum.f32(float, float) #1
    693 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
        ; f64 intrinsics (not called in the final part of the file; presumably used
        ; by earlier tests).
    694 declare double @llvm.fabs.f64(double) #1
    695 declare double @llvm.minnum.f64(double, double) #1
    696 declare double @llvm.maxnum.f64(double, double) #1
        ; Scalar f16 intrinsics (not called in the final part of the file;
        ; presumably used by earlier tests).
    697 declare half @llvm.fabs.f16(half) #1
    698 declare half @llvm.minnum.f16(half, half) #1
    699 declare half @llvm.maxnum.f16(half, half) #1
        ; Packed <2 x half> intrinsics used by the v2f16 clamp tests.
    700 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
    701 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
    702 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
    703 
    704 attributes #0 = { nounwind }
        ; #0 is attached to the kernel definitions and #1 to the intrinsic
        ; declarations. Groups #2 to #4 select combinations of the dx10-clamp and
        ; fp-exceptions target features ("+" enables, "-" disables), each with
        ; "no-nans-fp-math" set to "false". They are not referenced in the final
        ; part of the file; presumably earlier tests use them - confirm before
        ; changing.
    705 attributes #1 = { nounwind readnone }
        ; #2 has dx10-clamp off and fp-exceptions off.
    706 attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
        ; #3 has dx10-clamp on and fp-exceptions on.
    707 attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
        ; #4 has dx10-clamp off and fp-exceptions on.
    708 attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
    709