Home | History | Annotate | Download | only in lib
      1 .section #gm107_builtin_code
      2 // DIV U32
      3 // Unsigned 32-bit divide: returns quotient and remainder together.
      4 // UNR recurrence (q = a / b):
      5 // look for z such that 2^32 - b <= b * z < 2^32
      6 // then q - 1 <= (a * z) / 2^32 <= q
      7 // (a = dividend, b = divisor, z = scaled reciprocal estimate of b)
      8 // INPUT:   $r0: dividend, $r1: divisor
      9 // OUTPUT:  $r0: result, $r1: modulus
     10 // CLOBBER: $r2 - $r3, $p0 - $p1 ($p1 is listed but never referenced below)
     11 // SIZE:    22 / 14 * 8 bytes (NOTE(review): looks stale -- the code below has 9 sched lines + 27 instruction slots; confirm)
     12 // Register roles in the body: $r2 = z, $r3 = scratch, $r1 = -b while z is refined
     13 gm107_div_u32:
     14    sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6) // three entries, one per instruction of the following group; field meanings are not shown in this file
     15    flo u32 $r2 $r1 // $r2 = bit index of b's leading one (assuming flo = find-leading-one)
     16    lop xor 1 $r2 $r2 0x1f // $r2 ^= 31, i.e. 31 - index for an index in 0..31
     17    mov $r3 0x1 0xf // $r3 = 1
     18    sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
     19    shl $r2 $r3 $r2 // z = 1 << $r2: power-of-two starting estimate
     20    i2i u32 u32 $r1 neg $r1 // $r1 = -b, kept negated for the refinement steps
     21    imul u32 u32 $r3 $r1 $r2 // step 1: $r3 = (-b) * z, low 32 bits
     22    sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
     23    imad u32 u32 hi $r2 $r2 $r3 $r2 // step 1: z += high 32 bits of (z * $r3)
     24    imul u32 u32 $r3 $r1 $r2 // step 2 (same imul/imad pair)
     25    imad u32 u32 hi $r2 $r2 $r3 $r2
     26    sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
     27    imul u32 u32 $r3 $r1 $r2 // step 3
     28    imad u32 u32 hi $r2 $r2 $r3 $r2
     29    imul u32 u32 $r3 $r1 $r2 // step 4
     30    sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
     31    imad u32 u32 hi $r2 $r2 $r3 $r2
     32    imul u32 u32 $r3 $r1 $r2 // step 5 (last refinement)
     33    imad u32 u32 hi $r2 $r2 $r3 $r2
     34    sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
     35    mov $r3 $r0 0xf // $r3 = a: keep the dividend, $r0 is overwritten next
     36    imul u32 u32 hi $r0 $r0 $r2 // q = high 32 bits of (a * z)
     37    i2i u32 u32 $r2 neg $r1 // $r2 = -(-b) = b; z is no longer needed
     38    sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
     39    imad u32 u32 $r1 $r1 $r0 $r3 // $r1 = (-b) * q + a = a - b*q, the remainder estimate
     40    isetp ge u32 and $p0 1 $r1 $r2 1 // $p0 = ($r1 >= b), unsigned compare
     41    $p0 iadd $r1 $r1 neg $r2 // if $p0: remainder -= b
     42    sched (st 0x5) (st 0xd) (st 0x1)
     43    $p0 iadd $r0 $r0 0x1 // if $p0: q += 1
     44    $p0 isetp ge u32 and $p0 1 $r1 $r2 1 // if $p0: re-test $r1 >= b for a second correction
     45    $p0 iadd $r1 $r1 neg $r2 // if $p0: remainder -= b
     46    sched (st 0x1) (st 0xf) (st 0xf)
     47    $p0 iadd $r0 $r0 0x1 // if $p0: q += 1
     48    ret
     49    nop 0 // occupies the third slot of the final sched group
     50 
     51 // DIV S32, like DIV U32 after taking ABS(inputs)
     52 // The two result signs are captured in $p2/$p3 first and re-applied at the end.
     53 // INPUT:   $r0: dividend, $r1: divisor
     54 // OUTPUT:  $r0: result, $r1: modulus
     55 // CLOBBER: $r2 - $r3, $p0 - $p3
     56 // (the body below references $p0, $p2 and $p3 only; $p1 is never referenced)
     57 gm107_div_s32:
     58    sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0) // three entries, one per instruction of the following group; field meanings are not shown in this file
     59    isetp lt and $p2 0x1 $r0 0 1 // $p2 = (dividend < 0)
     60    isetp lt xor $p3 1 $r1 0 $p2 // $p3 = (divisor < 0) xor $p2: set when the operand signs differ
     61    i2i s32 s32 $r0 abs $r0 // a = |dividend|
     62    sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2)
     63    i2i s32 s32 $r1 abs $r1 // b = |divisor|
     64    flo u32 $r2 $r1 // $r2 = bit index of b's leading one (assuming flo = find-leading-one)
     65    lop xor 1 $r2 $r2 0x1f // $r2 ^= 31, i.e. 31 - index for an index in 0..31
     66    sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
     67    mov $r3 0x1 0xf // $r3 = 1
     68    shl $r2 $r3 $r2 // z = 1 << $r2: power-of-two starting estimate
     69    i2i u32 u32 $r1 neg $r1 // $r1 = -b, kept negated for the refinement steps
     70    sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
     71    imul u32 u32 $r3 $r1 $r2 // step 1: $r3 = (-b) * z, low 32 bits
     72    imad u32 u32 hi $r2 $r2 $r3 $r2 // step 1: z += high 32 bits of (z * $r3)
     73    imul u32 u32 $r3 $r1 $r2 // step 2 (same imul/imad pair)
     74    sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
     75    imad u32 u32 hi $r2 $r2 $r3 $r2
     76    imul u32 u32 $r3 $r1 $r2 // step 3
     77    imad u32 u32 hi $r2 $r2 $r3 $r2
     78    sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
     79    imul u32 u32 $r3 $r1 $r2 // step 4
     80    imad u32 u32 hi $r2 $r2 $r3 $r2
     81    imul u32 u32 $r3 $r1 $r2 // step 5 (last refinement)
     82    sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
     83    imad u32 u32 hi $r2 $r2 $r3 $r2
     84    mov $r3 $r0 0xf // $r3 = a: keep |dividend|, $r0 is overwritten next
     85    imul u32 u32 hi $r0 $r0 $r2 // q = high 32 bits of (a * z)
     86    sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
     87    i2i u32 u32 $r2 neg $r1 // $r2 = -(-b) = b; z is no longer needed
     88    imad u32 u32 $r1 $r1 $r0 $r3 // $r1 = (-b) * q + a = a - b*q, the remainder estimate
     89    isetp ge u32 and $p0 1 $r1 $r2 1 // $p0 = ($r1 >= b), unsigned compare
     90    sched (st 0x1) (st 0x5) (st 0xd)
     91    $p0 iadd $r1 $r1 neg $r2 // if $p0: remainder -= b
     92    $p0 iadd $r0 $r0 0x1 // if $p0: q += 1
     93    $p0 isetp ge u32 and $p0 1 $r1 $r2 1 // if $p0: re-test $r1 >= b for a second correction
     94    sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
     95    $p0 iadd $r1 $r1 neg $r2 // if $p0: remainder -= b
     96    $p0 iadd $r0 $r0 0x1 // if $p0: q += 1
     97    $p3 i2i s32 s32 $r0 neg $r0 // negate the quotient when the operand signs differed
     98    sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
     99    $p2 i2i s32 s32 $r1 neg $r1 // negate the modulus when the dividend was negative
    100    ret
    101    nop 0 // occupies the third slot of the final sched group
    102 
    103 // STUB: both f64 labels share this one entry point, whose body is only ret + two nops
    104 gm107_rcp_f64:
    105 gm107_rsq_f64:
    106    sched (st 0x0) (st 0x0) (st 0x0)
    107    ret // returns immediately; no reciprocal / reciprocal-sqrt is computed here
    108    nop 0 // the two nops occupy the remaining slots of the sched group
    109    nop 0
    110 
    111 .section #gm107_builtin_offsets // one 64-bit entry per routine label defined above
    112 .b64 #gm107_div_u32 // entry 0 (presumably the label's offset in the code section -- confirm against the consumer)
    113 .b64 #gm107_div_s32 // entry 1
    114 .b64 #gm107_rcp_f64 // entry 2: same address as entry 3, the labels share the stub
    115 .b64 #gm107_rsq_f64 // entry 3
    116