Home | History | Annotate | Download | only in lib
      1 .section #gk110_builtin_code   // code section: holds the built-in routines defined below
      2 // DIV U32
      3 //
      4 // UNR recurrence (q = a / b):
      5 // look for z such that 2^32 - b <= b * z < 2^32
      6 // then q - 1 <= (a * z) / 2^32 <= q
      7 //
      8 // INPUT:   $r0: dividend, $r1: divisor
      9 // OUTPUT:  $r0: result, $r1: modulus
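     10 // CLOBBER: $r2 - $r3, $p0 - $p1 (NOTE(review): the code below only writes $p0)
     11 // SIZE:    22 / 14 * 8 bytes (NOTE(review): the code below has 26 instructions plus 4 sched lines; confirm this figure)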
     12 //
     13 gk110_div_u32:                                  // unsigned 32-bit divide: $r0 = a / b, $r1 = a % b; no zero-divisor check is made here
     14    sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28     // 7 scheduling values; every sched line in this file is followed by 7 instructions
     15    bfind u32 $r2 $r1                            // $r2 = bit index of the highest set bit of the divisor b
     16    xor b32 $r2 $r2 0x1f                         // $r2 = 31 - index (xor with 31 mirrors a 0..31 index)
     17    mov b32 $r3 0x1
     18    shl b32 $r2 $r3 clamp $r2                    // initial estimate z = 1 << $r2, giving 2^31 <= b * z < 2^32 for b != 0
     19    cvt u32 $r1 neg u32 $r1                      // $r1 = -b, kept negated through the refinement steps
     20    mul $r3 u32 $r1 u32 $r2                      // refinement 1 of 5: $r3 = (-b * z) mod 2^32 (= 2^32 - b * z while b * z < 2^32)
     21    add $r2 (mul high u32 $r2 u32 $r3) $r2       //   z += high 32 bits of (z * $r3)
     22    sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
     23    mul $r3 u32 $r1 u32 $r2                      // refinement 2 of 5 (same two-instruction step)
     24    add $r2 (mul high u32 $r2 u32 $r3) $r2
     25    mul $r3 u32 $r1 u32 $r2                      // refinement 3 of 5
     26    add $r2 (mul high u32 $r2 u32 $r3) $r2
     27    mul $r3 u32 $r1 u32 $r2                      // refinement 4 of 5
     28    add $r2 (mul high u32 $r2 u32 $r3) $r2
     29    mul $r3 u32 $r1 u32 $r2                      // refinement 5 of 5 (its add follows the next sched line)
     30    sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
     31    add $r2 (mul high u32 $r2 u32 $r3) $r2       // $r2 = final reciprocal estimate z
     32    mov b32 $r3 $r0                              // save the dividend a; $r0 is overwritten by the quotient next
     33    mul high $r0 u32 $r0 u32 $r2                 // quotient estimate q = high 32 bits of (a * z)
     34    cvt u32 $r2 neg u32 $r1                      // $r2 = -(-b) = b, restored for the comparisons below
     35    add $r1 (mul u32 $r1 u32 $r0) $r3            // remainder estimate: $r1 = (-b) * q + a = a - b * q
     36    set $p0 0x1 ge u32 $r1 $r2                   // $p0 = (remainder >= b), unsigned compare
     37    $p0 sub b32 $r1 $r1 $r2                      // correction 1: remainder -= b ...
     38    sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20     // NOTE(review): only 5 instructions remain here; the last 2 values presumably cover the first 2 instructions of gk110_div_s32 - confirm before inserting or removing instructions
     39    $p0 add b32 $r0 $r0 0x1                      // ... and quotient += 1
     40    $p0 set $p0 0x1 ge u32 $r1 $r2               // re-test only if correction 1 was applied; otherwise $p0 stays false
     41    $p0 sub b32 $r1 $r1 $r2                      // correction 2: remainder -= b
     42    $p0 add b32 $r0 $r0 0x1                      //               quotient += 1
     43    ret                                          // $r0 = quotient, $r1 = remainder
     44 
     45 // DIV S32, like DIV U32 after taking ABS(inputs)
     46 //
     47 // INPUT:   $r0: dividend, $r1: divisor
     48 // OUTPUT:  $r0: result, $r1: modulus
     49 // CLOBBER: $r2 - $r3, $p0 - $p3
     50 //
     51 gk110_div_s32:                                  // signed 32-bit divide: unsigned divide of |a| and |b|, then signs are re-applied; no zero-divisor check is made here
     52    set $p2 0x1 lt s32 $r0 0x0                   // $p2 = dividend is negative (the two set instructions precede this routine's first sched line)
     53    set $p3 0x1 lt s32 $r1 0x0 xor $p2           // $p3 = (divisor is negative) xor $p2, i.e. the operand signs differ
     54    sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28     // 7 scheduling values for the 7 instructions that follow
     55    cvt s32 $r0 abs s32 $r0                      // $r0 = |a|
     56    cvt s32 $r1 abs s32 $r1                      // $r1 = |b|
     57    bfind u32 $r2 $r1                            // $r2 = bit index of the highest set bit of |b|
     58    xor b32 $r2 $r2 0x1f                         // $r2 = 31 - index (xor with 31 mirrors a 0..31 index)
     59    mov b32 $r3 0x1
     60    shl b32 $r2 $r3 clamp $r2                    // initial estimate z = 1 << $r2, giving 2^31 <= |b| * z < 2^32 for b != 0
     61    cvt u32 $r1 neg u32 $r1                      // $r1 = -|b|, kept negated through the refinement steps
     62    sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
     63    mul $r3 u32 $r1 u32 $r2                      // refinement 1 of 5: $r3 = (-|b| * z) mod 2^32
     64    add $r2 (mul high u32 $r2 u32 $r3) $r2       //   z += high 32 bits of (z * $r3)
     65    mul $r3 u32 $r1 u32 $r2                      // refinement 2 of 5 (same two-instruction step)
     66    add $r2 (mul high u32 $r2 u32 $r3) $r2
     67    mul $r3 u32 $r1 u32 $r2                      // refinement 3 of 5
     68    add $r2 (mul high u32 $r2 u32 $r3) $r2
     69    mul $r3 u32 $r1 u32 $r2                      // refinement 4 of 5 (its add follows the next sched line)
     70    sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
     71    add $r2 (mul high u32 $r2 u32 $r3) $r2
     72    mul $r3 u32 $r1 u32 $r2                      // refinement 5 of 5
     73    add $r2 (mul high u32 $r2 u32 $r3) $r2       // $r2 = final reciprocal estimate z
     74    mov b32 $r3 $r0                              // save |a|; $r0 is overwritten by the quotient next
     75    mul high $r0 u32 $r0 u32 $r2                 // quotient estimate q = high 32 bits of (|a| * z)
     76    cvt u32 $r2 neg u32 $r1                      // $r2 = |b|, restored for the comparisons below
     77    add $r1 (mul u32 $r1 u32 $r0) $r3            // remainder estimate: $r1 = (-|b|) * q + |a| = |a| - |b| * q
     78    sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
     79    set $p0 0x1 ge u32 $r1 $r2                   // $p0 = (remainder >= |b|), unsigned compare
     80    $p0 sub b32 $r1 $r1 $r2                      // correction 1: remainder -= |b|
     81    $p0 add b32 $r0 $r0 0x1                      //               quotient += 1
     82    $p0 set $p0 0x1 ge u32 $r1 $r2               // re-test only if correction 1 was applied; otherwise $p0 stays false
     83    $p0 sub b32 $r1 $r1 $r2                      // correction 2: remainder -= |b|
     84    $p0 add b32 $r0 $r0 0x1                      //               quotient += 1
     85    $p3 cvt s32 $r0 neg s32 $r0                  // negate the quotient when the operand signs differed
     86    sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c     // NOTE(review): only 2 instructions of this routine follow; the remaining values presumably apply to whatever is assembled next - confirm
     87    $p2 cvt s32 $r1 neg s32 $r1                  // negate the remainder when the dividend was negative
     88    ret                                          // $r0 = signed quotient, $r1 = signed remainder
     89 
     90 gk110_rcp_f64:    // both labels resolve to the single ret below: these entry points are empty stubs
     91 gk110_rsq_f64:    // NOTE(review): presumably placeholders for f64 reciprocal / reciprocal square root - confirm
     92    ret            // returns immediately without touching any register
     93 
     94 .section #gk110_builtin_offsets   // separate section: table of entry points into the code section above
     95 .b64 #gk110_div_u32               // one 64-bit entry per routine label, in the order the routines appear above
     96 .b64 #gk110_div_s32               // NOTE(review): the entry order is presumably relied on by the consumer of this table - confirm before reordering
     97 .b64 #gk110_rcp_f64               // refers to the same location as the next entry (both labels sit on one ret)
     98 .b64 #gk110_rsq_f64
     99