// armv8/ — AArch64-only implementation (scale-factor processing)
      1 //.include "ihevc_neon_macros.s"
      2 .macro push_v_regs
      3     stp             X8, X9, [sp, #-16]!
      4     stp             X10, X11, [sp, #-16]!
      5     stp             X12, X13, [sp, #-16]!
      6     stp             X14, X15, [sp, #-16]!
      7     stp             X16, X17, [sp, #-16]!
      8     stp             x19, x20, [sp, #-16]!
      9     stp             x21, x22, [sp, #-16]!
     10     stp             X29, X30, [sp, #-16]!
     11 .endm
     12 .macro pop_v_regs
     13     ldp             X29, X30, [sp], #16
     14     ldp             x21, x22, [sp], #16
     15     ldp             x19, x20, [sp], #16
     16     ldp             X16, X17, [sp], #16
     17     ldp             X14, X15, [sp], #16
     18     ldp             X12, X13, [sp], #16
     19     ldp             X10, X11, [sp], #16
     20     ldp             X8, X9, [sp], #16
     21 .endm
     22 
.text
.p2align 2
.global ixheaacd_scale_factor_process_armv8

//-----------------------------------------------------------------------------
// ixheaacd_scale_factor_process_armv8
//
// Applies per-scalefactor-band gains to a buffer of inverse-quantized 32-bit
// spectral coefficients, emulating the old 32x16 fractional multiply
// (SMULWB-style mult32x16in32) with SMULL + ASR #16.
//
// Register arguments (AAPCS64; roles inferred from use — TODO confirm against
// the C prototype at the call site):
//   x0 = pointer to 32-bit coefficient buffer (read-modify-write, advanced)
//   x1 = pointer to signed 16-bit scale factors, one per band (advanced)
//   x2 = number of bands; returns immediately when <= 0
//   x3 = pointer to 8-bit per-band widths (loops below consume 4 or 2
//        coefficients per pass, so widths are presumably multiples of 4 —
//        verify against caller)
//   x4 = pointer to a 32-bit scale table indexed by (scale_factor & 3)
//   x5 = mode selector: > 2 picks base exponent 34 (0x22), else 37 (0x25)
//   x6, x7 = copied into x21/x22 but never read again (dead — kept as-is)
//
// Clobbers: x0-x7 (caller-saved anyway); every other register written here
// (x9-x11, x14, x17, x19, x20) is saved/restored by push_v_regs/pop_v_regs.
//-----------------------------------------------------------------------------
ixheaacd_scale_factor_process_armv8:

    push_v_regs                         // save working registers (see macro)

    MOV             x9, x4              // x9 = scale_table_ptr

    MOV             x21, x6             // NOTE(review): x21/x22 are written but
    MOV             x22, x7             // never read below — dead moves
    CMP             x2, #0              // total bands <= 0?

    BGT             lbl17               // at least one band: go process

    pop_v_regs                          // nothing to do: restore and return
    ret
lbl17:
    MOV             x10, #0             // x10 = 0, used to zero-fill quiet bands
    CMP             x5, #2
    BGT             ADD_34
    MOV             x11, #0x25          // base exponent = 37
    B               TBANDS_LOOP
ADD_34:
    MOV             x11, #0x22          // base exponent = 34
    // MOV         x11, #0x25 // temp=37

TBANDS_LOOP:
    LDRSH           x5, [x1], #2        // scale_factor = *Scfactor++ (signed 16-bit)
    LDRB            w4, [x3], #1        // band width for this band
    sxtw            x4, w4              // NOTE(review): LDRB already zero-extends,
                                        // so this sxtw is effectively a plain copy


    CMP             x5, #0x18           // if (scale_factor < 24) the gain
    BGE             SCALE_FACTOR_GE_12  // underflows to zero — clear the band

    CMP             x4, #0              // empty band? skip the clear loop
    BLE             OFFSET_ZERO

SCALE_FACTOR_LT_12:
    // Zero-fill path: 4 words (two 64-bit zero stores) per pass.
    STR             x10, [x0], #8
    STR             x10, [x0], #8
    SUBS            x4, x4, #4
    BGT             SCALE_FACTOR_LT_12
    B               OFFSET_ZERO

SCALE_FACTOR_GE_12:

    SUBS            x6, x11, x5, ASR #2 // shift = base - (scale_factor >> 2);
                                        // flags kept live for BLE below
    AND             x5, x5, #3          // scale_factor & 3 (non-S: flags untouched)

    //ADD x5,x9,x5,LSL #1 ; scale_table_ptr[(scale_factor & 0x0003)];
    LDR             w5, [x9, x5, LSL #2] // scale_short = scale_table_ptr[scale_factor & 3]
    sxtw            x5, w5
    AND             w17, w5, #0x0000FFFF // keep the low 16 bits of the table entry
    sxth            w17, w17            // 16-bit value stored as 32-bit, so SMULL
                                        // below can stand in for SMULWB
    BLE             SHIFT_LE_ZERO       // shift <= 0 (flags still from SUBS above)

    SUB             x14, x6, #1         // fold fixmul's <<1 into the down-shift

SHIFT_POSITIVE: // right-shift path; consumes 4 coefficients per pass
    LDP             w6, w7 , [x0, #0]   // temp1 = *x_invquant (x4 values)
    LDP             w19, w20, [x0, #8]

    //SMULWB      x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17         // 32x16 -> 64-bit products
    SMULL           x7, w7, w17
    SMULL           x19, w19, w17
    SMULL           x20, w20, w17

    ASR             x6, x6, #16         // >>16 completes mult32x16in32 (minus the
    ASR             x7, x7 , #16        // <<1 folded into x14 above)
    ASR             x19, x19 , #16
    ASR             x20, x20 , #16

    ASR             x6, x6, x14         // buffex1 = shx32(buffex1, shift);
    ASR             x7, x7, x14
    ASR             x19, x19, x14
    ASR             x20, x20, x14

    stp             w6, w7, [x0], #8    // store 4 scaled coefficients back
    stp             w19, w20, [x0], #8

    SUBS            x4, x4, #4          // 4 coefficients consumed

    BGT             SHIFT_POSITIVE
    B               OFFSET_ZERO
SHIFT_LE_ZERO:

    //RSBS        x14, x6, #0 //-shift
    NEGS            x14, x6             // x14 = -shift; flags pick zero vs negative
    BGT             SHIFT_NEGTIVE1      // shift < 0: left-shift path

SHIFT_ZERO: // shift == 0; consumes 2 coefficients per pass
    LDP             w6, w7, [x0, #0]    // temp1 = *x_invquant;

    //SMULWB      x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17
    SMULL           x7, w7, w17

    ASR             x6, x6, #16
    ASR             x7, x7, #16

    LSL             x6, x6, #1          // the <<1 completing fixmul_32x16b
    LSL             x7, x7, #1

    STP             w6, w7, [x0], #8    // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_ZERO
    B               OFFSET_ZERO

SHIFT_NEGTIVE1:
    SUB             x14, x14, #1        // pre-shift by (-shift - 1); the LSL #2
                                        // below supplies the last bit + fixmul's <<1
SHIFT_NEGTIVE: // left-shift path; consumes 2 coefficients per pass

    LDP             w6, w7, [x0, #0]
    LSL             w6, w6, w14         // buffex1 = shl32(temp1, -shift-1);
    LSL             w7, w7, w14         // buffex1 = shl32(temp1, -shift-1);

    //SMULWB      x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17
    SMULL           x7, w7, w17
    ASR             x6, x6, #16
    ASR             x7, x7, #16

    LSL             x6, x6, #2          // shl for fixmul_32x16b and shl32(buffer,1)
    LSL             x7, x7, #2          // shl for fixmul_32x16b and shl32(buffer,1)

    STP             w6, w7, [x0], #8    // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_NEGTIVE

OFFSET_ZERO:
    SUBS            x2, x2, #1          // next band
    BGT             TBANDS_LOOP

    pop_v_regs
    ret
    167