Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
      3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
      4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
      5 
      6 ; Make sure fdiv is promoted to f32.
      7 
      8 ; GCN-LABEL: {{^}}v_fdiv_f16:
        ; Half fdiv with no fast-math flags. The SI run expects both operands to be
        ; converted to f32 and the full f32 division sequence (div_scale, rcp, fma
        ; refinement, div_fmas, div_fixup) before converting back to f16. The fiji
        ; and gfx900 runs expect an f32 rcp + mul, a conversion back to f16, and a
        ; final v_div_fixup_f16 on the original operands.
        ; The label pattern ends with a colon so it cannot also match the longer
        ; v_fdiv_f16_arcp / v_fdiv_f16_unsafe labels.
      9 ; SI:     v_cvt_f32_f16
     10 ; SI:     v_cvt_f32_f16
     11 ; SI:     v_div_scale_f32
     12 ; SI-DAG: v_div_scale_f32
     13 ; SI-DAG: v_rcp_f32
     14 ; SI:     v_fma_f32
     15 ; SI:     v_fma_f32
     16 ; SI:     v_mul_f32
     17 ; SI:     v_fma_f32
     18 ; SI:     v_fma_f32
     19 ; SI:     v_fma_f32
     20 ; SI:     v_div_fmas_f32
     21 ; SI:     v_div_fixup_f32
     22 ; SI:     v_cvt_f16_f32
     23 
     24 ; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
     25 ; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
     26 
     27 ; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
     28 ; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
     29 
     30 ; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
     31 ; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
     32 ; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
     33 ; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
     34 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     35 define amdgpu_kernel void @v_fdiv_f16(
     36     half addrspace(1)* %r,
     37     half addrspace(1)* %a,
     38     half addrspace(1)* %b) #0 {
     39 entry:
     40   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     41   %tid.ext = sext i32 %tid to i64
     42   %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
     43   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
     44   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
     45   %a.val = load volatile half, half addrspace(1)* %gep.a
     46   %b.val = load volatile half, half addrspace(1)* %gep.b
     47   %r.val = fdiv half %a.val, %b.val
     48   store half %r.val, half addrspace(1)* %gep.r
     49   ret void
     50 }
     51 
     52 ; GCN-LABEL: {{^}}v_rcp_f16:
        ; 1.0 / x on half. The fiji and gfx900 runs expect a single v_rcp_f16 fed
        ; directly by the loaded value, with neither the source nor the result
        ; register mentioned in between the checked instructions.
     53 ; GFX8_9: {{flat|global}}_load_ushort [[SRC:v[0-9]+]]
     54 ; GFX8_9-NOT: [[SRC]]
     55 ; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[SRC]]
     56 ; GFX8_9-NOT: [[RCP]]
     57 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RCP]]
     58 define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
     59 entry:
     60   %lane = call i32 @llvm.amdgcn.workitem.id.x()
     61   %lane.i64 = sext i32 %lane to i64
     62   %src.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.i64
     63   %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.i64
     64   %x = load volatile half, half addrspace(1)* %src.ptr
     65   %recip = fdiv half 1.0, %x
     66   store half %recip, half addrspace(1)* %dst.ptr
     67   ret void
     68 }
     69 
     70 ; GCN-LABEL: {{^}}v_rcp_f16_abs:
        ; 1.0 / fabs(x) on half. The fiji and gfx900 runs expect the fabs to be
        ; folded into the rcp as a |src| source modifier (hence the e64 encoding),
        ; and the result register must not be mentioned again before the store.
     71 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
     72 ; GFX8_9-NOT: [[VAL]]
     73 ; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
     74 ; GFX8_9-NOT: [[RESULT]]
     75 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
     76 define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
     77 entry:
     78   %tid = call i32 @llvm.amdgcn.workitem.id.x()
     79   %tid.ext = sext i32 %tid to i64
     80   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
     81   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
     82   %b.val = load volatile half, half addrspace(1)* %gep.b
     83   %b.abs = call half @llvm.fabs.f16(half %b.val)
     84   %r.val = fdiv half 1.0, %b.abs
     85   store half %r.val, half addrspace(1)* %gep.r
     86   ret void
     87 }
     88 
     89 ; GCN-LABEL: {{^}}v_rcp_f16_arcp:
        ; Same as the plain 1.0 / x case but with the arcp flag on the fdiv. The
        ; fiji and gfx900 runs expect the identical single v_rcp_f16.
     90 ; GFX8_9: {{flat|global}}_load_ushort [[SRC:v[0-9]+]]
     91 ; GFX8_9-NOT: [[SRC]]
     92 ; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[SRC]]
     93 ; GFX8_9-NOT: [[RCP]]
     94 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RCP]]
     95 define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
     96 entry:
     97   %lane = call i32 @llvm.amdgcn.workitem.id.x()
     98   %lane.i64 = sext i32 %lane to i64
     99   %src.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.i64
    100   %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.i64
    101   %x = load volatile half, half addrspace(1)* %src.ptr
    102   %recip = fdiv arcp half 1.0, %x
    103   store half %recip, half addrspace(1)* %dst.ptr
    104   ret void
    105 }
    106 
    107 ; GCN-LABEL: {{^}}v_rcp_f16_neg:
        ; -1.0 / x on half. The fiji and gfx900 runs expect the negation to be
        ; folded into the rcp as a -src source modifier (hence the e64 encoding),
        ; and the result register must not be mentioned again before the store.
    108 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
    109 ; GFX8_9-NOT: [[VAL]]
    110 ; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
    111 ; GFX8_9-NOT: [[RESULT]]
    112 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    113 define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
    114 entry:
    115   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    116   %tid.ext = sext i32 %tid to i64
    117   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
    118   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
    119   %b.val = load volatile half, half addrspace(1)* %gep.b
    120   %r.val = fdiv half -1.0, %b.val
    121   store half %r.val, half addrspace(1)* %gep.r
    122   ret void
    123 }
    124 
    125 ; GCN-LABEL: {{^}}v_rsq_f16:
        ; 1.0 / sqrt(x) on half. The fiji and gfx900 runs expect the sqrt and the
        ; reciprocal to be combined into a single v_rsq_f16, and the result
        ; register must not be mentioned again before the store.
    126 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
    127 ; GFX8_9-NOT: [[VAL]]
    128 ; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
    129 ; GFX8_9-NOT: [[RESULT]]
    130 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    131 define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
    132 entry:
    133   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    134   %tid.ext = sext i32 %tid to i64
    135   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
    136   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
    137   %b.val = load volatile half, half addrspace(1)* %gep.b
    138   %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
    139   %r.val = fdiv half 1.0, %b.sqrt
    140   store half %r.val, half addrspace(1)* %gep.r
    141   ret void
    142 }
    143 
    144 ; GCN-LABEL: {{^}}v_rsq_f16_neg:
        ; -1.0 / sqrt(x) on half. Unlike the positive case, the fiji and gfx900
        ; runs expect a separate v_sqrt_f16 immediately followed by a v_rcp_f16
        ; with a negated source; the result register must not be mentioned again
        ; before the store.
    145 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
    146 ; GFX8_9-NOT: [[VAL]]
    147 ; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
    148 ; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
    149 ; GFX8_9-NOT: [[RESULT]]
    150 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
    151 define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
    152 entry:
    153   %tid = call i32 @llvm.amdgcn.workitem.id.x()
    154   %tid.ext = sext i32 %tid to i64
    155   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
    156   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
    157   %b.val = load volatile half, half addrspace(1)* %gep.b
    158   %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
    159   %r.val = fdiv half -1.0, %b.sqrt
    160   store half %r.val, half addrspace(1)* %gep.r
    161   ret void
    162 }
    163 
    164 ; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
        ; a / b on half with the arcp flag. The fiji and gfx900 runs expect the
        ; division to become an f16 reciprocal of the divisor followed by an f16
        ; multiply with the dividend.
    165 ; GFX8_9: {{flat|global}}_load_ushort [[NUM:v[0-9]+]]
    166 ; GFX8_9: {{flat|global}}_load_ushort [[DEN:v[0-9]+]]
    167 
    168 ; GFX8_9: v_rcp_f16_e32 [[INV:v[0-9]+]], [[DEN]]
    169 ; GFX8_9: v_mul_f16_e32 [[QUOT:v[0-9]+]], [[NUM]], [[INV]]
    170 
    171 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[QUOT]]
    172 define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
    173 entry:
    174   %lane = call i32 @llvm.amdgcn.workitem.id.x()
    175   %lane.i64 = sext i32 %lane to i64
    176   %num.ptr = getelementptr inbounds half, half addrspace(1)* %a, i64 %lane.i64
    177   %den.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.i64
    178   %out.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.i64
    179   %num = load volatile half, half addrspace(1)* %num.ptr
    180   %den = load volatile half, half addrspace(1)* %den.ptr
    181   %quot = fdiv arcp half %num, %den
    182   store half %quot, half addrspace(1)* %out.ptr
    183   ret void
    184 }
    185 
    186 ; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
        ; a / b on half with no flags on the fdiv itself; the kernel instead uses
        ; attribute group #2, which sets "unsafe-fp-math"="true". The fiji and
        ; gfx900 runs expect the same rcp + mul lowering as the arcp variant.
    187 ; GFX8_9: {{flat|global}}_load_ushort [[NUM:v[0-9]+]]
    188 ; GFX8_9: {{flat|global}}_load_ushort [[DEN:v[0-9]+]]
    189 
    190 ; GFX8_9: v_rcp_f16_e32 [[INV:v[0-9]+]], [[DEN]]
    191 ; GFX8_9: v_mul_f16_e32 [[QUOT:v[0-9]+]], [[NUM]], [[INV]]
    192 
    193 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[QUOT]]
    194 define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
    195 entry:
    196   %lane = call i32 @llvm.amdgcn.workitem.id.x()
    197   %lane.i64 = sext i32 %lane to i64
    198   %num.ptr = getelementptr inbounds half, half addrspace(1)* %a, i64 %lane.i64
    199   %den.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.i64
    200   %out.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.i64
    201   %num = load volatile half, half addrspace(1)* %num.ptr
    202   %den = load volatile half, half addrspace(1)* %den.ptr
    203   %quot = fdiv half %num, %den
    204   store half %quot, half addrspace(1)* %out.ptr
    205   ret void
    206 }
    207 
    208 ; GCN-LABEL: {{^}}div_arcp_2_x_pat_f16:
        ; x / 2.0 with arcp is expected to fold to a multiply by 0.5 (f32 multiply
        ; on the SI run, f16 multiply on the fiji and gfx900 runs).
        ; The label uses the GCN prefix because no RUN line enables a FUNC prefix;
        ; a label under an unused prefix is silently ignored by FileCheck.
    209 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
    210 
    211 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
    212 ; GFX8_9: buffer_store_short [[MUL]]
    213 define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
    214   %x = load half, half addrspace(1)* undef
    215   %rcp = fdiv arcp half %x, 2.0
    216   store half %rcp, half addrspace(1)* %out, align 4
    217   ret void
    218 }
    219 
    220 ; GCN-LABEL: {{^}}div_arcp_k_x_pat_f16:
        ; x / 10.0 with arcp is expected to fold to a multiply by a constant:
        ; 0x2e66 as an f16 literal on the fiji and gfx900 runs, and 0x3dccc000
        ; (the same half value widened to f32) on the SI run.
        ; The label uses the GCN prefix because no RUN line enables a FUNC prefix;
        ; a label under an unused prefix is silently ignored by FileCheck.
    221 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
    222 
    223 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
    224 ; GFX8_9: buffer_store_short [[MUL]]
    225 define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
    226   %x = load half, half addrspace(1)* undef
    227   %rcp = fdiv arcp half %x, 10.0
    228   store half %rcp, half addrspace(1)* %out, align 4
    229   ret void
    230 }
    231 
    232 ; GCN-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
        ; x / -10.0 with arcp is expected to fold to a multiply by a negative
        ; constant: 0xae66 as an f16 literal on the fiji and gfx900 runs, and
        ; 0xbdccc000 (the same half value widened to f32) on the SI run.
        ; The label uses the GCN prefix because no RUN line enables a FUNC prefix;
        ; a label under an unused prefix is silently ignored by FileCheck.
    233 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
    234 
    235 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
    236 ; GFX8_9: buffer_store_short [[MUL]]
    237 define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
    238   %x = load half, half addrspace(1)* undef
    239   %rcp = fdiv arcp half %x, -10.0
    240   store half %rcp, half addrspace(1)* %out, align 4
    241   ret void
    242 }
    243 
    244 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; called by the v_* kernels; its result is sign-extended and used as the element index
    245 declare half @llvm.sqrt.f16(half) #1 ; called by the v_rsq_f16 kernels
    246 declare half @llvm.fabs.f16(half) #1 ; called by v_rcp_f16_abs
    247 
    248 attributes #0 = { nounwind } ; used by every kernel in this file except v_fdiv_f16_unsafe
    249 attributes #1 = { nounwind readnone } ; used by the intrinsic declarations
    250 attributes #2 = { nounwind "unsafe-fp-math"="true" } ; used only by v_fdiv_f16_unsafe
    251