Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
      2 
      3 ; GCN-LABEL: {{^}}udiv32_invariant_denom:
      4 ; GCN:     v_cvt_f32_u32
      5 ; GCN:     v_rcp_iflag_f32
      6 ; GCN:     v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
      7 ; GCN:     v_cvt_u32_f32_e32
      8 ; GCN-DAG: v_mul_hi_u32
      9 ; GCN-DAG: v_mul_lo_i32
     10 ; GCN-DAG: v_sub_i32_e32
     11 ; GCN-DAG: v_cmp_eq_u32_e64
     12 ; GCN-DAG: v_cndmask_b32_e64
     13 ; GCN-DAG: v_mul_hi_u32
     14 ; GCN-DAG: v_add_i32_e32
     15 ; GCN-DAG: v_subrev_i32_e32
     16 ; GCN-DAG: v_cndmask_b32_e64
     17 ; GCN:     [[LOOP:BB[0-9_]+]]:
     18 ; GCN-NOT: v_rcp
     19 ; GCN:     s_cbranch_scc0 [[LOOP]]
     20 ; GCN:     s_endpgm
     21 define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; stores the unsigned quotient %tmp / %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
     22 bb:                                               ; entry block: branches straight into the loop
     23   br label %bb3
     24 
     25 bb2:                                              ; preds = %bb3 (loop exit)
     26   ret void
     27 
     28 bb3:                                              ; preds = %bb3, %bb (single-block loop)
     29   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]      ; loop counter: 0 from the entry block, %tmp7 from the back edge
     30   %tmp4 = udiv i32 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
     31   %tmp5 = zext i32 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
     32   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
     33   store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4 ; write the quotient to that element
     34   %tmp7 = add nuw nsw i32 %tmp, 1                 ; next counter value
     35   %tmp8 = icmp eq i32 %tmp7, 1024                 ; true once the next counter value reaches 1024
     36   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
     37 }
     38 
     39 ; GCN-LABEL: {{^}}urem32_invariant_denom:
     40 ; GCN:     v_cvt_f32_u32
     41 ; GCN:     v_rcp_iflag_f32
     42 ; GCN:     v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
     43 ; GCN:     v_cvt_u32_f32_e32
     44 ; GCN-DAG: v_mul_hi_u32
     45 ; GCN-DAG: v_mul_lo_i32
     46 ; GCN-DAG: v_sub_i32_e32
     47 ; GCN-DAG: v_cmp_eq_u32_e64
     48 ; GCN-DAG: v_cndmask_b32_e64
     49 ; GCN-DAG: v_mul_hi_u32
     50 ; GCN-DAG: v_add_i32_e32
     51 ; GCN-DAG: v_subrev_i32_e32
     52 ; GCN-DAG: v_cndmask_b32_e64
     53 ; GCN:     [[LOOP:BB[0-9_]+]]:
     54 ; GCN-NOT: v_rcp
     55 ; GCN:     s_cbranch_scc0 [[LOOP]]
     56 ; GCN:     s_endpgm
     57 define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; stores the unsigned remainder %tmp % %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
     58 bb:                                               ; entry block: branches straight into the loop
     59   br label %bb3
     60 
     61 bb2:                                              ; preds = %bb3 (loop exit)
     62   ret void
     63 
     64 bb3:                                              ; preds = %bb3, %bb (single-block loop)
     65   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]      ; loop counter: 0 from the entry block, %tmp7 from the back edge
     66   %tmp4 = urem i32 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
     67   %tmp5 = zext i32 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
     68   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
     69   store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4 ; write the remainder to that element
     70   %tmp7 = add nuw nsw i32 %tmp, 1                 ; next counter value
     71   %tmp8 = icmp eq i32 %tmp7, 1024                 ; true once the next counter value reaches 1024
     72   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
     73 }
     74 
     75 ; GCN-LABEL: {{^}}sdiv32_invariant_denom:
     76 ; GCN:     v_cvt_f32_u32
     77 ; GCN:     v_rcp_iflag_f32
     78 ; GCN:     v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
     79 ; GCN:     v_cvt_u32_f32_e32
     80 ; GCN-DAG: v_mul_hi_u32
     81 ; GCN-DAG: v_mul_lo_i32
     82 ; GCN-DAG: v_sub_i32_e32
     83 ; GCN-DAG: v_cmp_eq_u32_e64
     84 ; GCN-DAG: v_cndmask_b32_e64
     85 ; GCN-DAG: v_mul_hi_u32
     86 ; GCN-DAG: v_add_i32_e32
     87 ; GCN-DAG: v_subrev_i32_e32
     88 ; GCN-DAG: v_cndmask_b32_e64
     89 ; GCN:     [[LOOP:BB[0-9_]+]]:
     90 ; GCN-NOT: v_rcp
     91 ; GCN:     s_cbranch_scc0 [[LOOP]]
     92 ; GCN:     s_endpgm
     93 define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; stores the signed quotient %tmp / %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
     94 bb:                                               ; entry block: branches straight into the loop
     95   br label %bb3
     96 
     97 bb2:                                              ; preds = %bb3 (loop exit)
     98   ret void
     99 
    100 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    101   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]      ; loop counter: 0 from the entry block, %tmp7 from the back edge
    102   %tmp4 = sdiv i32 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    103   %tmp5 = zext i32 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    104   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    105   store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4 ; write the quotient to that element
    106   %tmp7 = add nuw nsw i32 %tmp, 1                 ; next counter value
    107   %tmp8 = icmp eq i32 %tmp7, 1024                 ; true once the next counter value reaches 1024
    108   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    109 }
    110 
    111 ; GCN-LABEL: {{^}}srem32_invariant_denom:
    112 ; GCN:     v_cvt_f32_u32
    113 ; GCN:     v_rcp_iflag_f32
    114 ; GCN:     v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
    115 ; GCN:     v_cvt_u32_f32_e32
    116 ; GCN-DAG: v_mul_hi_u32
    117 ; GCN-DAG: v_mul_lo_i32
    118 ; GCN-DAG: v_sub_i32_e32
    119 ; GCN-DAG: v_cmp_eq_u32_e64
    120 ; GCN-DAG: v_cndmask_b32_e64
    121 ; GCN-DAG: v_mul_hi_u32
    122 ; GCN-DAG: v_add_i32_e32
    123 ; GCN-DAG: v_subrev_i32_e32
    124 ; GCN-DAG: v_cndmask_b32_e64
    125 ; GCN:     [[LOOP:BB[0-9_]+]]:
    126 ; GCN-NOT: v_rcp
    127 ; GCN:     s_cbranch_scc0 [[LOOP]]
    128 ; GCN:     s_endpgm
    129 define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; stores the signed remainder %tmp % %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
    130 bb:                                               ; entry block: branches straight into the loop
    131   br label %bb3
    132 
    133 bb2:                                              ; preds = %bb3 (loop exit)
    134   ret void
    135 
    136 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    137   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]      ; loop counter: 0 from the entry block, %tmp7 from the back edge
    138   %tmp4 = srem i32 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    139   %tmp5 = zext i32 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    140   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    141   store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4 ; write the remainder to that element
    142   %tmp7 = add nuw nsw i32 %tmp, 1                 ; next counter value
    143   %tmp8 = icmp eq i32 %tmp7, 1024                 ; true once the next counter value reaches 1024
    144   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    145 }
    146 
    147 ; GCN-LABEL: {{^}}udiv16_invariant_denom:
    148 ; GCN:     v_cvt_f32_u32
    149 ; GCN:     v_rcp_iflag_f32
    150 ; GCN:     [[LOOP:BB[0-9_]+]]:
    151 ; GCN-NOT: v_rcp
    152 ; GCN:     s_cbranch_scc0 [[LOOP]]
    153 ; GCN:     s_endpgm
    154 define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; 16-bit variant: stores the unsigned quotient %tmp / %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
    155 bb:                                               ; entry block: branches straight into the loop
    156   br label %bb3
    157 
    158 bb2:                                              ; preds = %bb3 (loop exit)
    159   ret void
    160 
    161 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    162   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]      ; i16 loop counter: 0 from the entry block, %tmp7 from the back edge
    163   %tmp4 = udiv i16 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    164   %tmp5 = zext i16 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    165   %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    166   store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2 ; write the 16-bit quotient to that element
    167   %tmp7 = add nuw nsw i16 %tmp, 1                 ; next counter value
    168   %tmp8 = icmp eq i16 %tmp7, 1024                 ; true once the next counter value reaches 1024
    169   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    170 }
    171 
    172 ; GCN-LABEL: {{^}}urem16_invariant_denom:
    173 ; GCN:     v_cvt_f32_u32
    174 ; GCN:     v_rcp_iflag_f32
    175 ; GCN:     [[LOOP:BB[0-9_]+]]:
    176 ; GCN-NOT: v_rcp
    177 ; GCN:     s_cbranch_scc0 [[LOOP]]
    178 ; GCN:     s_endpgm
    179 define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; 16-bit variant: stores the unsigned remainder %tmp % %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
    180 bb:                                               ; entry block: branches straight into the loop
    181   br label %bb3
    182 
    183 bb2:                                              ; preds = %bb3 (loop exit)
    184   ret void
    185 
    186 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    187   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]      ; i16 loop counter: 0 from the entry block, %tmp7 from the back edge
    188   %tmp4 = urem i16 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    189   %tmp5 = zext i16 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    190   %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    191   store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2 ; write the 16-bit remainder to that element
    192   %tmp7 = add nuw nsw i16 %tmp, 1                 ; next counter value
    193   %tmp8 = icmp eq i16 %tmp7, 1024                 ; true once the next counter value reaches 1024
    194   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    195 }
    196 
    197 ; GCN-LABEL: {{^}}sdiv16_invariant_denom:
    198 ; GCN-DAG: s_sext_i32_i16
    199 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
    200 ; GCN-DAG: v_cvt_f32_i32
    201 ; GCN-DAG: v_rcp_iflag_f32
    202 ; GCN:     [[LOOP:BB[0-9_]+]]:
    203 ; GCN-NOT: v_rcp
    204 ; GCN:     s_cbranch_scc0 [[LOOP]]
    205 ; GCN:     s_endpgm
    206 define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; 16-bit variant: stores the signed quotient %tmp / %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
    207 bb:                                               ; entry block: branches straight into the loop
    208   br label %bb3
    209 
    210 bb2:                                              ; preds = %bb3 (loop exit)
    211   ret void
    212 
    213 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    214   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]      ; i16 loop counter: 0 from the entry block, %tmp7 from the back edge
    215   %tmp4 = sdiv i16 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    216   %tmp5 = zext i16 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    217   %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    218   store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2 ; write the 16-bit quotient to that element
    219   %tmp7 = add nuw nsw i16 %tmp, 1                 ; next counter value
    220   %tmp8 = icmp eq i16 %tmp7, 1024                 ; true once the next counter value reaches 1024
    221   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    222 }
    223 
    224 ; GCN-LABEL: {{^}}srem16_invariant_denom:
    225 ; GCN-DAG: s_sext_i32_i16
    226 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
    227 ; GCN-DAG: v_cvt_f32_i32
    228 ; GCN-DAG: v_rcp_iflag_f32
    229 ; GCN:     [[LOOP:BB[0-9_]+]]:
    230 ; GCN-NOT: v_rcp
    231 ; GCN:     s_cbranch_scc0 [[LOOP]]
    232 ; GCN:     s_endpgm
    233 define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; 16-bit variant: stores the signed remainder %tmp % %arg1 to %arg[%tmp] for %tmp = 0..1023; %arg1 is never redefined, so the divisor is loop-invariant
    234 bb:                                               ; entry block: branches straight into the loop
    235   br label %bb3
    236 
    237 bb2:                                              ; preds = %bb3 (loop exit)
    238   ret void
    239 
    240 bb3:                                              ; preds = %bb3, %bb (single-block loop)
    241   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]      ; i16 loop counter: 0 from the entry block, %tmp7 from the back edge
    242   %tmp4 = srem i16 %tmp, %arg1                    ; operation under test; the GCN-NOT line above this function forbids v_rcp between the loop label and its back branch
    243   %tmp5 = zext i16 %tmp to i64                    ; widen the counter to i64 for use as the GEP index
    244   %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5 ; address of element %tmp of %arg
    245   store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2 ; write the 16-bit remainder to that element
    246   %tmp7 = add nuw nsw i16 %tmp, 1                 ; next counter value
    247   %tmp8 = icmp eq i16 %tmp7, 1024                 ; true once the next counter value reaches 1024
    248   br i1 %tmp8, label %bb2, label %bb3             ; true -> exit, false -> run the loop body again
    249 }
    250