//.include "ihevc_neon_macros.s"
.macro push_v_regs
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             x19, x20, [sp, #-16]!
    stp             x21, x22, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             x21, x22, [sp], #16
    ldp             x19, x20, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
.endm

.text
.p2align 2
.global ixheaacd_scale_factor_process_armv8

// Applies the per-band scale factors to the inverse-quantized spectrum.
// Register usage (taken from the inline comments below):
//   x0 : x_invquant      - 32-bit spectral coefficients, scaled in place
//   x1 : Scfactor        - 16-bit scale factors, one per band
//   x2 : Tbands          - number of bands to process
//   x3 : Offset          - band widths, read byte-wise
//   x4 : scale_table_ptr - table of 32-bit entries holding 16-bit scale values
//   x5 : selects the maximum-shift constant (0x25, or 0x22 when x5 > 2)
ixheaacd_scale_factor_process_armv8:

    push_v_regs

    MOV             x9, x4

    MOV             x21, x6
    MOV             x22, x7
    CMP             x2, #0                  // Tbands
    BGT             lbl17

    pop_v_regs
    ret

lbl17:
    MOV             x10, #0
    CMP             x5, #2
    BGT             ADD_34
    MOV             x11, #0x25
    B               TBANDS_LOOP
ADD_34:
    MOV             x11, #0x22
    // MOV          x11, #0x25              // temp = 37

TBANDS_LOOP:
    LDRSH           x5, [x1], #2            // scale_factor = *Scfactor++;
    LDRB            w4, [x3], #1            // Offset[1]
    sxtw            x4, w4

    CMP             x5, #0x18               // if (scale_factor < 24)
    BGE             SCALE_FACTOR_GE_12

    CMP             x4, #0
    BLE             OFFSET_ZERO

SCALE_FACTOR_LT_12:

    STR             x10, [x0], #8
    STR             x10, [x0], #8
    SUBS            x4, x4, #4
    BGT             SCALE_FACTOR_LT_12
    B               OFFSET_ZERO

SCALE_FACTOR_GE_12:

    SUBS            x6, x11, x5, ASR #2     // 37 - (scale_factor >> 2)
    AND             x5, x5, #3              // scale_factor & 0x0003

    //ADD           x5, x9, x5, LSL #1      // scale_table_ptr[(scale_factor & 0x0003)];
    LDR             w5, [x9, x5, LSL #2]    // scale_short = scale_table_ptr[(scale_factor & 0x0003)];
    sxtw            x5, w5
    AND             w17, w5, #0x0000FFFF
    sxth            w17, w17                // 16-bit value stored as 32-bit, so SMULWB can still be used
    BLE             SHIFT_LE_ZERO           // if shift is less than or equal to zero

    SUB             x14, x6, #1             // don't do that extra LSL #1 in SMULWB

SHIFT_POSITIVE:                             // loop over sfbWidth, a multiple of 4
    LDP             w6, w7, [x0, #0]        // temp1 = *x_invquant
    LDP             w19, w20, [x0, #8]

    //SMULWB        x6, x6, x5              // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17
    SMULL           x7, w7, w17
    SMULL           x19, w19, w17
    SMULL           x20, w20, w17

    ASR             x6, x6, #16
    ASR             x7, x7, #16
    ASR             x19, x19, #16
    ASR             x20, x20, #16

    ASR             x6, x6, x14             // buffex1 = shx32(buffex1, shift);
    ASR             x7, x7, x14
    ASR             x19, x19, x14
    ASR             x20, x20, x14

    stp             w6, w7, [x0], #8
    stp             w19, w20, [x0], #8

    SUBS            x4, x4, #4

    BGT             SHIFT_POSITIVE
    B               OFFSET_ZERO

SHIFT_LE_ZERO:

    //RSBS          x14, x6, #0             // -shift
    NEGS            x14, x6
    BGT             SHIFT_NEGTIVE1

SHIFT_ZERO:                                 // loop over sfbWidth, a multiple of 4
    LDP             w6, w7, [x0, #0]        // temp1 = *x_invquant;

    //SMULWB        x6, x6, x5              // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17
    SMULL           x7, w7, w17

    ASR             x6, x6, #16
    ASR             x7, x7, #16

    LSL             x6, x6, #1
    LSL             x7, x7, #1

    STP             w6, w7, [x0], #8        // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_ZERO
    B               OFFSET_ZERO

SHIFT_NEGTIVE1:
    SUB             x14, x14, #1
SHIFT_NEGTIVE:                              // loop over sfbWidth, a multiple of 4

    LDP             w6, w7, [x0, #0]
    LSL             w6, w6, w14             // buffex1 = shl32(buffex1, shift - 1);
    LSL             w7, w7, w14             // buffex1 = shl32(buffex1, shift - 1);

    //SMULWB        x6, x6, x5              // buffex1 = mult32x16in32(temp1, scale_short);
    SMULL           x6, w6, w17
    SMULL           x7, w7, w17
    ASR             x6, x6, #16
    ASR             x7, x7, #16

    LSL             x6, x6, #2              // shl for fixmul_32x16b and shl32(buffer, 1)
    LSL             x7, x7, #2              // shl for fixmul_32x16b and shl32(buffer, 1)

    STP             w6, w7, [x0], #8        // *x_invquant++ = buffex1;

    SUBS            x4, x4, #2

    BGT             SHIFT_NEGTIVE

OFFSET_ZERO:
    SUBS            x2, x2, #1
    BGT             TBANDS_LOOP

    pop_v_regs
    ret
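
// ---------------------------------------------------------------------------
// Illustrative C sketch (not the library's C source): the function below
// restates what the assembly above computes, using the names that appear in
// the inline comments (x_invquant, Scfactor, Offset, scale_table_ptr). The
// sketch's name, parameter types and the max_shift argument are assumptions
// made for illustration; in the assembly the maximum shift is 0x25 (37), or
// 0x22 (34) when the sixth argument is greater than 2, and band widths are
// assumed to be multiples of 4, as the unrolled loops require. The scale
// table is read as 32-bit entries whose low 16 bits hold the scale value,
// mirroring the LDR/sxth sequence above.
//
//   #include <stdint.h>
//
//   static void scale_factor_process_sketch(int32_t *x_invquant,
//                                           const int16_t *Scfactor,
//                                           int32_t t_bands,
//                                           const uint8_t *Offset,
//                                           const int32_t *scale_table_ptr,
//                                           int32_t max_shift) {
//     for (int32_t band = 0; band < t_bands; band++) {
//       int32_t scale_factor = *Scfactor++;
//       int32_t width = *Offset++;
//       if (scale_factor < 24) {                  /* SCALE_FACTOR_LT_12     */
//         for (int32_t i = 0; i < width; i++) *x_invquant++ = 0;
//         continue;
//       }
//       int32_t shift = max_shift - (scale_factor >> 2);
//       int16_t scale_short = (int16_t)scale_table_ptr[scale_factor & 3];
//       for (int32_t i = 0; i < width; i++) {
//         int32_t temp1 = *x_invquant;
//         int32_t buffex1;
//         if (shift > 0) {                        /* SHIFT_POSITIVE         */
//           buffex1 = (int32_t)(((int64_t)temp1 * scale_short) >> 16);
//           buffex1 >>= (shift - 1);
//         } else if (shift == 0) {                /* SHIFT_ZERO             */
//           buffex1 = (int32_t)(((int64_t)temp1 * scale_short) >> 16);
//           buffex1 = (int32_t)((uint32_t)buffex1 << 1);
//         } else {                                /* SHIFT_NEGTIVE          */
//           temp1 = (int32_t)((uint32_t)temp1 << (-shift - 1));
//           buffex1 = (int32_t)(((int64_t)temp1 * scale_short) >> 16);
//           buffex1 = (int32_t)((uint32_t)buffex1 << 2);
//         }
//         *x_invquant++ = buffex1;
//       }
//     }
//   }
// ---------------------------------------------------------------------------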