; Codegen test for v_mac_f32 / v_mad_f32 formation on AMDGPU (SI and VI/Tonga).
      1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
      2 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
      3 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
      4 
; All three operands are loaded into VGPRs, so the mul+add pair can be
; emitted as a single v_mac_f32 that accumulates in place into the
; register holding the addend C, which is then stored.
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1   ; &in[1] (offset:4)
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2   ; &in[2] (offset:8)

  ; Volatile keeps all three loads (and their distinct offsets) in the output.
  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr

  ; out = a * b + c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}
     25 
; One SGPR input times inline constant 0.5, plus inline constant 0.5:
; this must select v_mad_f32 (three explicit sources), not the mac form,
; since the addend here is an immediate rather than a VGPR accumulator.
; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
  ; out = 0.5 * in + 0.5
  %tmp0 = fmul float 0.5, %in
  %tmp1 = fadd float %tmp0, 0.5
  store float %tmp1, float addrspace(1)* %out
  ret void
}
     36 
; Two VGPR multiply operands and an SGPR addend (kernel argument %c):
; expected to select v_mad_f32 with a scalar src2, not the mac form.
; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1   ; &in[1]

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr

  ; out = a * b + c, with c arriving as a scalar argument
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}
     52 
; Both multiply operands are the same SGPR (%a * %a) and the addend is a
; loaded VGPR; the e64 encoding of the mac is expected so the scalar
; sources can be used while accumulating into the VGPR holding %c.
; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
  %c = load float, float addrspace(1)* %in

  ; out = a * a + c
  %tmp0 = fmul float %a, %a
  %tmp1 = fadd float %tmp0, %c
  store float %tmp1, float addrspace(1)* %out
  ret void
}
     64 
; Two mul+add pairs share the same addend %c. Only one of them may use
; the in-place accumulating form (it clobbers the register holding %c),
; so the other must be a v_mad_f32 writing a separate destination while
; still reading [[ADD]].
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
  %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
  %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4

  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %b_ptr
  %c = load volatile float, float addrspace(1)* %c_ptr
  %d = load volatile float, float addrspace(1)* %d_ptr
  %e = load volatile float, float addrspace(1)* %e_ptr

  ; out[0] = a * b + c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %c

  ; out[1] = d * e + c  (same addend %c as above)
  %tmp2 = fmul float %d, %e
  %tmp3 = fadd float %tmp2, %c

  %out1 = getelementptr float, float addrspace(1)* %out, i32 1
  store float %tmp1, float addrspace(1)* %out
  store float %tmp3, float addrspace(1)* %out1
  ret void
}
     92 
; There is no advantage to using v_mac when one of the operands is negated
; and v_mad accepts more operand types.

; fsub(-0.0, %a) is a true fneg, so it folds into a source-negation
; modifier on the first multiply operand of v_mad_f32.
; GCN-LABEL: {{^}}mad_neg_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; out = (-a) * b + c
  %neg_a = fsub float -0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    115 
; fsub(0.0, %a) is only a negation when signed zeros may be ignored;
; attribute set #1 enables "no-signed-zeros-fp-math", so the subtract
; still folds into a negation modifier on the mad's first source.
; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; (0.0 - a) * b + c, with nsz this is (-a) * b + c
  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    135 
; Same IR as nsz_mad_sub0_src0 but with attribute set #0 (signed zeros
; honored), so fsub(0.0, %a) cannot be treated as a negation: an explicit
; v_sub_f32 must remain, feeding the mac.
; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; (0.0 - a) * b + c; the subtract must be preserved (0.0 - (-0.0) == +0.0)
  %neg_a = fsub float 0.0, %a
  %tmp0 = fmul float %neg_a, %b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    155 
; Like mad_neg_src0 but the negation is on the second multiply operand;
; the fneg still folds into a source modifier on the v_mad (the checks
; accept the negation commuted onto either multiply source).
; GCN-LABEL: {{^}}mad_neg_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; out = a * (-b) + c
  %neg_b = fsub float -0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    175 
; fsub(0.0, %b) on the second multiply operand, with attribute set #1
; ("no-signed-zeros-fp-math"): the subtract folds to a negation modifier
; on the v_mad source.
; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; a * (0.0 - b) + c, with nsz this is a * (-b) + c
  %neg_b = fsub float 0.0, %b
  %tmp0 = fmul float %a, %neg_b
  %tmp1 = fadd float %tmp0, %c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    195 
; Negated addend: the fneg on %c folds into a negation modifier on the
; mad's third source, which the mac form could not express.
; GCN-LABEL: {{^}}mad_neg_src2:
; GCN-NOT: v_mac
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
  %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2

  %a = load float, float addrspace(1)* %in
  %b = load float, float addrspace(1)* %b_ptr
  %c = load float, float addrspace(1)* %c_ptr

  ; out = a * b + (-c)
  %neg_c = fsub float -0.0, %c
  %tmp0 = fmul float %a, %b
  %tmp1 = fadd float %tmp0, %neg_c

  store float %tmp1, float addrspace(1)* %out
  ret void
}
    215 
; Without special casing the inline constant check for v_mac_f32's
; src2, this fails to fold the 1.0 into a mad.

; The checked pattern is for the %tmp5 chain: (tmp+tmp) is the add, and
; 1.0 - 4.0*tmp2 is expected to become v_mad_f32 tmp2, -4.0, 1.0.
; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
bb:
  ; Per-lane addressing so the loads use the flat/buffer + offset forms checked above.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile float, float addrspace(1)* %gep.a
  %tmp1 = load volatile float, float addrspace(1)* %gep.b
  ; tmp5 = (1.0 - 4.0*(tmp+tmp)) + tmp1  -> mad with inline -4.0 and 1.0
  %tmp2 = fadd float %tmp, %tmp
  %tmp3 = fmul float %tmp2, 4.0
  %tmp4 = fsub float 1.0, %tmp3
  %tmp5 = fadd float %tmp4, %tmp1
  ; tmp9 = 8.0 * (1.0 - (tmp1+tmp1)*tmp); keeps 1.0 live in a second context
  %tmp6 = fadd float %tmp1, %tmp1
  %tmp7 = fmul float %tmp6, %tmp
  %tmp8 = fsub float 1.0, %tmp7
  %tmp9 = fmul float %tmp8, 8.0
  %tmp10 = fadd float %tmp5, %tmp9
  store float %tmp10, float addrspace(1)* %gep.out
  ret void
}
    246 
; Same computation as the f32 version above, but in half precision.
; SI has no f16 arithmetic, so the halves are converted to f32 and the
; f32 mad/mac pattern is checked; VI (flush-denormals run) is expected
; to use the native f16 add and mad. No checks for the VI-DENORM run
; here — presumably fma is formed there instead; TODO confirm.
; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]]

; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]]
; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}}

; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %tmp = load volatile half, half addrspace(1)* %gep.a
  %tmp1 = load volatile half, half addrspace(1)* %gep.b
  ; tmp5 = (1.0 - 4.0*(tmp+tmp)) + tmp1
  %tmp2 = fadd half %tmp, %tmp
  %tmp3 = fmul half %tmp2, 4.0
  %tmp4 = fsub half 1.0, %tmp3
  %tmp5 = fadd half %tmp4, %tmp1
  ; tmp9 = 8.0 * (1.0 - (tmp1+tmp1)*tmp)
  %tmp6 = fadd half %tmp1, %tmp1
  %tmp7 = fmul half %tmp6, %tmp
  %tmp8 = fsub half 1.0, %tmp7
  %tmp9 = fmul half %tmp8, 8.0
  %tmp10 = fadd half %tmp5, %tmp9
  store half %tmp10, half addrspace(1)* %gep.out
  ret void
}
    281 
declare i32 @llvm.amdgcn.workitem.id.x() #2

; #0: signed zeros honored  -> fsub(0.0, x) must NOT be folded to a negation.
; #1: no-signed-zeros       -> fsub(0.0, x) may be folded to a negation modifier.
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
    288