Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
      2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
        ; Both runs compile for gfx900. The first enables fp32 denormals and is
        ; verified with the GCN and GCN-DENORM prefixes; the second disables them
        ; and is verified with the GCN and GCN-FLUSH prefixes.
      3 
      4 ; GCN-LABEL: {{^}}div_1_by_x_25ulp:
        ; 1.0 / x with !fpmath !0 on a scalar loaded from the kernel argument.
        ; Denormal run: the checks expect x to be multiplied by a scale factor
        ; before v_rcp_f32 and the reciprocal to be multiplied by the same factor
        ; afterwards. The factor is 0x2f800000 (2^-32) when |x| > 0x6f800000 (2^96)
        ; and 1.0 otherwise.
        ; Flush run: a single v_rcp_f32 of the loaded value is expected.
      5 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
      6 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
      7 ; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
      8 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
      9 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
     10 ; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
     11 ; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
     12 ; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
     13 
     14 ; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
     15 
     16 ; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
     17 define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
     18   %load = load float, float addrspace(1)* %arg, align 4
     19   %div = fdiv float 1.000000e+00, %load, !fpmath !0
     20   store float %div, float addrspace(1)* %arg, align 4
     21   ret void
     22 }
     23 
     24 ; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
        ; -1.0 / x with !fpmath !0.
        ; Denormal run: same scaled v_rcp_f32 sequence as the 1.0 / x case, with
        ; the negation expected as a source modifier on the scale operand of the
        ; pre-scale multiply (hence the VOP3 _e64 form).
        ; Flush run: a single v_rcp_f32 with a negated source is expected.
     25 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
     26 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
     27 ; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
     28 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
     29 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
     30 ; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
     31 ; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
     32 ; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
     33 
     34 ; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
     35 
     36 ; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
     37 define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
     38   %load = load float, float addrspace(1)* %arg, align 4
     39   %div = fdiv float -1.000000e+00, %load, !fpmath !0
     40   store float %div, float addrspace(1)* %arg, align 4
     41   ret void
     42 }
     43 
     44 ; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
        ; 1.0 / (-x) with !fpmath !0.
        ; Denormal run: scaled v_rcp_f32 sequence, with the negation expected as a
        ; source modifier on the loaded value in the pre-scale multiply.
        ; Flush run: a single v_rcp_f32 with a negated source is expected.
     45 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
     46 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
     47 ; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
     48 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
     49 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
     50 ; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
     51 ; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
     52 ; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
     53 
     54 ; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
     55 
     56 ; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
     57 define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
     58   %load = load float, float addrspace(1)* %arg, align 4
     59   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
     60   %div = fdiv float 1.000000e+00, %neg, !fpmath !0
     61   store float %div, float addrspace(1)* %arg, align 4
     62   ret void
     63 }
     64 
     65 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
        ; -1.0 / (-x) with !fpmath !0.
        ; The checks expect the two negations to cancel: both runs look for the
        ; same instructions as the 1.0 / x case, with no neg source modifiers.
     66 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
     67 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
     68 ; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
     69 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
     70 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
     71 ; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
     72 ; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
     73 ; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
     74 
     75 ; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
     76 
     77 ; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
     78 define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
     79   %load = load float, float addrspace(1)* %arg, align 4
     80   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
     81   %div = fdiv float -1.000000e+00, %neg, !fpmath !0
     82   store float %div, float addrspace(1)* %arg, align 4
     83   ret void
     84 }
     85 
     86 ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
        ; <4 x float> version of 1.0 / x with !fpmath !0.
        ; Denormal run: four compare/select pairs, four pre-scale multiplies, four
        ; v_rcp_f32 and four post-scale multiplies are expected; all are DAG
        ; checks, so their relative order is not constrained.
        ; Flush run: four v_rcp_f32 are expected, the first reading the first SGPR
        ; of the loaded quad and the last reading the last one, with the results
        ; written by one dwordx4 store.
     87 ; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
     88 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
     89 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
     90 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
     91 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
     92 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
     93 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
     94 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
     95 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
     96 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
     97 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
     98 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
     99 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    100 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    101 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    102 ; GCN-DENORM-DAG: v_rcp_f32_e32
    103 ; GCN-DENORM-DAG: v_rcp_f32_e32
    104 ; GCN-DENORM-DAG: v_rcp_f32_e32
    105 ; GCN-DENORM-DAG: v_rcp_f32_e32
    106 ; GCN-DENORM-DAG: v_mul_f32_e32
    107 ; GCN-DENORM-DAG: v_mul_f32_e32
    108 ; GCN-DENORM-DAG: v_mul_f32_e32
    109 ; GCN-DENORM-DAG: v_mul_f32_e32
    110 
    111 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
    112 ; GCN-FLUSH:      v_rcp_f32_e32
    113 ; GCN-FLUSH:      v_rcp_f32_e32
    114 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
    115 ; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
    116 define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
    117   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    118   %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
    119   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    120   ret void
    121 }
    122 
    123 ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
        ; <4 x float> version of -1.0 / x with !fpmath !0.
        ; Denormal run: four compare/select pairs, four pre-scale multiplies with a
        ; negated scale operand, four v_rcp_f32 and four post-scale multiplies are
        ; expected (DAG checks, order not constrained).
        ; Flush run: four v_rcp_f32 with negated sources are expected, followed by
        ; one dwordx4 store of the results.
        ; The s_load_dwordx4 check captures VAL0/VAL3 inside this function so the
        ; flush checks do not depend on values bound by an earlier function.
        ; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
    124 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    125 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    126 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    127 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    128 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    129 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    130 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    131 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    132 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    133 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    134 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
    135 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
    136 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
    137 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
    138 ; GCN-DENORM-DAG: v_rcp_f32_e32
    139 ; GCN-DENORM-DAG: v_rcp_f32_e32
    140 ; GCN-DENORM-DAG: v_rcp_f32_e32
    141 ; GCN-DENORM-DAG: v_rcp_f32_e32
    142 ; GCN-DENORM-DAG: v_mul_f32_e32
    143 ; GCN-DENORM-DAG: v_mul_f32_e32
    144 ; GCN-DENORM-DAG: v_mul_f32_e32
    145 ; GCN-DENORM-DAG: v_mul_f32_e32
    146 
    147 ; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
    148 ; GCN-FLUSH:      v_rcp_f32_e64
    149 ; GCN-FLUSH:      v_rcp_f32_e64
    150 ; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
        ; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
    151 define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
    152   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    153   %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
    154   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    155   ret void
    156 }
    157 
    158 ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
        ; <4 x float> version of 1.0 / (-x) with !fpmath !0.
        ; Denormal run: four compare/select pairs, four pre-scale multiplies with a
        ; negated loaded operand, four v_rcp_f32 and four post-scale multiplies are
        ; expected (DAG checks, order not constrained).
        ; Flush run: four v_rcp_f32 with negated sources are expected, followed by
        ; one dwordx4 store of the results.
        ; The s_load_dwordx4 check captures VAL0/VAL3 inside this function so the
        ; flush checks do not depend on values bound by an earlier function.
        ; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
    159 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    160 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    161 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    162 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    163 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    164 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    165 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    166 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    167 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    168 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    169 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
    170 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
    171 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
    172 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
    173 ; GCN-DENORM-DAG: v_rcp_f32_e32
    174 ; GCN-DENORM-DAG: v_rcp_f32_e32
    175 ; GCN-DENORM-DAG: v_rcp_f32_e32
    176 ; GCN-DENORM-DAG: v_rcp_f32_e32
    177 ; GCN-DENORM-DAG: v_mul_f32_e32
    178 ; GCN-DENORM-DAG: v_mul_f32_e32
    179 ; GCN-DENORM-DAG: v_mul_f32_e32
    180 ; GCN-DENORM-DAG: v_mul_f32_e32
    181 
    182 ; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
    183 ; GCN-FLUSH:      v_rcp_f32_e64
    184 ; GCN-FLUSH:      v_rcp_f32_e64
    185 ; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
    186 ; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
    187 define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
    188   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    189   %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
    190   %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
    191   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    192   ret void
    193 }
    194 
    195 ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
        ; <4 x float> version of -1.0 / (-x) with !fpmath !0.
        ; The checks expect the two negations to cancel: both runs look for the
        ; same instructions as the <4 x float> 1.0 / x case, with no neg source
        ; modifiers.
    196 ; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
    197 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    198 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    199 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    200 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    201 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    202 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    203 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    204 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    205 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    206 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    207 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    208 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    209 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    210 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    211 ; GCN-DENORM-DAG: v_rcp_f32_e32
    212 ; GCN-DENORM-DAG: v_rcp_f32_e32
    213 ; GCN-DENORM-DAG: v_rcp_f32_e32
    214 ; GCN-DENORM-DAG: v_rcp_f32_e32
    215 ; GCN-DENORM-DAG: v_mul_f32_e32
    216 ; GCN-DENORM-DAG: v_mul_f32_e32
    217 ; GCN-DENORM-DAG: v_mul_f32_e32
    218 ; GCN-DENORM-DAG: v_mul_f32_e32
    219 
    220 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
    221 ; GCN-FLUSH:      v_rcp_f32_e32
    222 ; GCN-FLUSH:      v_rcp_f32_e32
    223 ; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
    224 ; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
    225 define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
    226   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    227   %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
    228   %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
    229   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    230   ret void
    231 }
    232 
    233 ; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
        ; <4 x float> <2.0, 1.0, -1.0, -2.0> / x with !fpmath !0, mixing
        ; numerators that are +/-1.0 with numerators that are not.
        ; Both runs expect the two range constants and two compare/select pairs.
        ; Denormal run: two scaled v_rcp_f32 sequences (one pre-scale multiply
        ; plain, one with a negated scale operand) are expected, plus the
        ; v_div_scale / v_div_fmas / v_div_fixup expansion with 2.0 and -2.0 as
        ; the trailing operand.
        ; Flush run: one plain and one source-negated v_rcp_f32 are expected and
        ; the NOT check rejects any v_div instruction before the store.
        ; NOTE(review): the NOT checks on v_cmp_gt/v_cndmask appear intended to
        ; cap the number of compare/select pairs at two - confirm.
    234 ; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    235 ; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    236 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
    237 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
    238 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    239 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    240 ; GCN-DENORM-DAG: v_rcp_f32_e32
    241 ; GCN-DENORM-DAG: v_rcp_f32_e32
    242 
    243 ; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    244 ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    245 ; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    246 ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    247 
    248 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    249 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
    250 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
    251 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
    252 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
    253 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
    254 
    255 ; GCN-DENORM-DAG: v_div_fmas_f32
    256 ; GCN-DENORM-DAG: v_div_fmas_f32
    257 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
    258 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
    259 
    260 ; GCN-FLUSH-DAG:  v_rcp_f32_e32
    261 ; GCN-FLUSH-DAG:  v_rcp_f32_e64
    262 
    263 ; GCN-NOT:        v_cmp_gt_f32_e64
    264 ; GCN-NOT:        v_cndmask_b32_e32
    265 ; GCN-FLUSH-NOT:  v_div
    266 
    267 ; GCN:            global_store_dwordx4
    268 define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
    269   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    270   %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
    271   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    272   ret void
    273 }
    274 
    275 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
        ; <4 x float> <2.0, 1.0, -1.0, -2.0> / (-x) with !fpmath !0.
        ; Same structure as the non-negated mixed-numerator test, except that the
        ; denormal-run checks expect -2.0 as the trailing operand of every
        ; v_div_scale / v_div_fixup, and the negated pre-scale multiply carries
        ; the neg modifier on the loaded SGPR operand.
        ; Flush run: one plain and one source-negated v_rcp_f32 are expected and
        ; the NOT check rejects any v_div instruction before the store.
    276 ; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    277 ; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    278 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    279 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    280 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    281 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
    282 ; GCN-DENORM-DAG: v_rcp_f32_e32
    283 ; GCN-DENORM-DAG: v_rcp_f32_e32
    284 
    285 ; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    286 ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    287 ; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
    288 ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
    289 
    290 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
    291 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
    292 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
    293 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
    294 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
    295 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
    296 
    297 ; GCN-DENORM-DAG: v_div_fmas_f32
    298 ; GCN-DENORM-DAG: v_div_fmas_f32
    299 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
    300 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
    301 
    302 ; GCN-FLUSH-DAG:  v_rcp_f32_e32
    303 ; GCN-FLUSH-DAG:  v_rcp_f32_e64
    304 
    305 ; GCN-NOT:        v_cmp_gt_f32_e64
    306 ; GCN-NOT:        v_cndmask_b32_e32
    307 ; GCN-FLUSH-NOT:  v_div
    308 
    309 ; GCN:            global_store_dwordx4
    310 define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
    311   %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
    312   %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
    313   %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
    314   store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
    315   ret void
    316 }
    317 
    318 ; GCN-LABEL: {{^}}div_v_by_x_25ulp:
        ; num / x with !fpmath !0, where num is a kernel argument rather than a
        ; constant.
        ; Denormal run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
        ; expansion is expected.
        ; Flush run: the scaled v_rcp_f32 sequence is expected (compare |x|
        ; against 0x6f800000, select 0x2f800000 or 1.0 as the scale, multiply,
        ; v_rcp_f32, multiply by the scale again).
        ; The check that binds L for the flush run uses the flush prefix, so L
        ; is captured in this function and not inherited from an earlier one.
    319 ; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
    320 
    321 ; GCN-DENORM-DAG: v_div_scale_f32
    322 ; GCN-DENORM-DAG: v_rcp_f32_e32
    323 ; GCN-DENORM-DAG: v_div_scale_f32
    324 ; GCN-DENORM:     v_div_fmas_f32
    325 ; GCN-DENORM:     v_div_fixup_f32 [[OUT:v[0-9]+]],
    326 
    327 ; GCN-FLUSH-DAG:  v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
    328 ; GCN-FLUSH-DAG:  v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
    329 ; GCN-FLUSH-DAG:  v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
    330 ; GCN-FLUSH-DAG:  v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
    331 ; GCN-FLUSH:      v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
    332 ; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
    333 ; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
    334 
    335 ; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
    336 define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
    337   %load = load float, float addrspace(1)* %arg, align 4
    338   %div = fdiv float %num, %load, !fpmath !0
    339   store float %div, float addrspace(1)* %arg, align 4
    340   ret void
    341 }
    342 
    343 ; GCN-LABEL: {{^}}div_1_by_x_fast:
        ; fdiv fast 1.0 / x: both runs expect a single v_rcp_f32 of the loaded
        ; value, stored back directly.
    344 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    345 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
    346 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    347 define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
    348   %load = load float, float addrspace(1)* %arg, align 4
    349   %div = fdiv fast float 1.000000e+00, %load
    350   store float %div, float addrspace(1)* %arg, align 4
    351   ret void
    352 }
    353 
    354 ; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
        ; fdiv fast -1.0 / x: both runs expect a single v_rcp_f32 with the
        ; negation applied as a source modifier.
    355 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    356 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
    357 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    358 define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
    359   %load = load float, float addrspace(1)* %arg, align 4
    360   %div = fdiv fast float -1.000000e+00, %load
    361   store float %div, float addrspace(1)* %arg, align 4
    362   ret void
    363 }
    364 
    365 ; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
        ; fdiv fast 1.0 / (-x): both runs expect a single v_rcp_f32 with the
        ; negation applied as a source modifier.
    366 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    367 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
    368 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    369 define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
    370   %load = load float, float addrspace(1)* %arg, align 4
    371   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
    372   %div = fdiv fast float 1.000000e+00, %neg
    373   store float %div, float addrspace(1)* %arg, align 4
    374   ret void
    375 }
    376 
    377 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
        ; fdiv fast -1.0 / (-x): the checks expect the negations to cancel, i.e. a
        ; single v_rcp_f32 of the loaded value with no source modifier.
    378 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    379 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
    380 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    381 define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
    382   %load = load float, float addrspace(1)* %arg, align 4
    383   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
    384   %div = fdiv fast float -1.000000e+00, %neg
    385   store float %div, float addrspace(1)* %arg, align 4
    386   ret void
    387 }
    388 
    389 ; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
        ; 1.0 / x with neither !fpmath nor fast-math flags.
        ; Denormal run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
        ; expansion is expected.
        ; Flush run: a single v_rcp_f32 of the loaded value is expected.
    390 ; GCN-DENORM-DAG: v_div_scale_f32
    391 ; GCN-DENORM-DAG: v_rcp_f32_e32
    392 ; GCN-DENORM-DAG: v_div_scale_f32
    393 ; GCN-DENORM:     v_div_fmas_f32
    394 ; GCN-DENORM:     v_div_fixup_f32
    395 
    396 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    397 ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
    398 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    399 define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
    400   %load = load float, float addrspace(1)* %arg, align 4
    401   %div = fdiv float 1.000000e+00, %load
    402   store float %div, float addrspace(1)* %arg, align 4
    403   ret void
    404 }
    405 
    406 ; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
        ; -1.0 / x with neither !fpmath nor fast-math flags.
        ; Denormal run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
        ; expansion is expected.
        ; Flush run: a single v_rcp_f32 with a negated source is expected.
    407 ; GCN-DENORM-DAG: v_div_scale_f32
    408 ; GCN-DENORM-DAG: v_rcp_f32_e32
    409 ; GCN-DENORM-DAG: v_div_scale_f32
    410 ; GCN-DENORM:     v_div_fmas_f32
    411 ; GCN-DENORM:     v_div_fixup_f32
    412 
    413 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    414 ; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
    415 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    416 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
    417   %load = load float, float addrspace(1)* %arg, align 4
    418   %div = fdiv float -1.000000e+00, %load
    419   store float %div, float addrspace(1)* %arg, align 4
    420   ret void
    421 }
    422 
    423 ; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
        ; 1.0 / (-x) with neither !fpmath nor fast-math flags.
        ; Denormal run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
        ; expansion is expected.
        ; Flush run: a single v_rcp_f32 with a negated source is expected.
    424 ; GCN-DENORM-DAG: v_div_scale_f32
    425 ; GCN-DENORM-DAG: v_rcp_f32_e32
    426 ; GCN-DENORM-DAG: v_div_scale_f32
    427 ; GCN-DENORM:     v_div_fmas_f32
    428 ; GCN-DENORM:     v_div_fixup_f32
    429 
    430 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    431 ; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
    432 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    433 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
    434   %load = load float, float addrspace(1)* %arg, align 4
    435   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
    436   %div = fdiv float 1.000000e+00, %neg
    437   store float %div, float addrspace(1)* %arg, align 4
    438   ret void
    439 }
    440 
    441 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
        ; -1.0 / (-x) with neither !fpmath nor fast-math flags.
        ; Denormal run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
        ; expansion is expected.
        ; Flush run: the checks expect the negations to cancel, i.e. a single
        ; v_rcp_f32 of the loaded value with no source modifier.
    442 ; GCN-DENORM-DAG: v_div_scale_f32
    443 ; GCN-DENORM-DAG: v_rcp_f32_e32
    444 ; GCN-DENORM-DAG: v_div_scale_f32
    445 ; GCN-DENORM:     v_div_fmas_f32
    446 ; GCN-DENORM:     v_div_fixup_f32
    447 
    448 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
    449 ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
    450 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
    451 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
    452   %load = load float, float addrspace(1)* %arg, align 4
    453   %neg = fsub float -0.000000e+00, %load ; -0.0 - x: the negated load
    454   %div = fdiv float -1.000000e+00, %neg
    455   store float %div, float addrspace(1)* %arg, align 4
    456   ret void
    457 }
    458 
    459 !0 = !{float 2.500000e+00} ; !fpmath operand (2.5) attached to the fdiv in every *_25ulp test above
    460