@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_ihadamard_scaling_a9.s
@ *
@ * @brief
@ *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
@ *  of 16x16 intra-prediction
@ *
@ * @author
@ *  Mohit
@ *
@ * @par List of Functions:
@ *  - ih264_ihadamard_scaling_4x4_a9()
@ *  - ih264_ihadamard_scaling_2x2_uv_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
@ * of a 16x16 intra prediction macroblock, and then performs scaling.
@ * prediction buffer
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse transformed content is then scaled based on the Qp value.
@ *
@ * @param[in] pi2_src
@ *  input 4x4 block of DC coefficients
@ *
@ * @param[out] pi2_out
@ *  output 4x4 block
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list (only element [0] is read)
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix (only element [0] is read)
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6)
@ *
@ * @param[in] pi4_tmp
@ *  temporary buffer of size 1*16 (not referenced by this implementation)
@ *
@ * @returns none
@ *
@ * @remarks none
@ *
@ *******************************************************************************
@ *
@ *
@ *******************************************************************************
@ *
@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
@                                  WORD16* pi2_out,
@                                  const UWORD16 *pu2_iscal_mat,
@                                  const UWORD16 *pu2_weigh_mat,
@                                  UWORD32 u4_qp_div_6,
@                                  WORD32* pi4_tmp)
@**************Variables Vs Registers*****************************************
@r0     => *pi2_src
@r1     => *pi2_out
@r2     => *pu2_iscal_mat   (reused for pu2_iscal_mat[0]*pu2_weigh_mat[0])
@r3     => *pu2_weigh_mat   (reused for pu2_weigh_mat[0])
@[sp]   => u4_qp_div_6      (loaded into r12)
@
@Clobbers: r2, r3, r12, q0-q3, q8-q15.
@Only caller-saved registers are used, so this is a leaf routine with no
@stack frame: nothing is pushed and the first stack argument sits at [sp].

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_a9

ih264_ihadamard_scaling_4x4_a9:

@VLD4.S16 de-interleaves the 16 coefficients so that d0..d3 each hold one
@column of the 4x4 block (one lane per row).
@Original note: VLD4 is used because the pointer is incremented by
@SUB_BLK_WIDTH_4x4; if that macro value changes, this instruction must change.
@
@Scaling performed on every transformed value t[i]:
@   out[i] = sat16( (((t[i] * iscal[0] * weigh[0]) << u4_qp_div_6) + 32) >> 6 )
@The "+ 32, >> 6, saturate to 16 bit" part is done by a single VQRSHRN #6.

    ldr           r12, [sp]                 @ r12 = u4_qp_div_6 (first stack argument)
    ldrh          r3, [r3]                  @ r3  = pu2_weigh_mat[0] (unsigned halfword)
    ldrh          r2, [r2]                  @ r2  = pu2_iscal_mat[0] (unsigned halfword)
    mul           r2, r3, r2                @ r2  = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.s32      q10, r12                  @ q10 = per-lane left-shift count (u4_qp_div_6)
    vdup.s32      q9, r2                    @ q9  = per-lane 32-bit scale factor

@=======================INVERSE HADAMARD TRANSFORM================================

    @ ---- horizontal pass: each lane handles one row ----
    vld4.s16      {d0, d1, d2, d3}, [r0]    @ d0..d3 = x4, x5, x6, x7 of every row
    vaddl.s16     q12, d0, d3               @ x0 = x4 + x7
    vaddl.s16     q13, d1, d2               @ x1 = x5 + x6
    vsubl.s16     q14, d1, d2               @ x2 = x5 - x6
    vsubl.s16     q15, d0, d3               @ x3 = x4 - x7

    vadd.s32      q2, q12, q13              @ pi4_tmp_ptr[0] = x0 + x1
    vadd.s32      q3, q15, q14              @ pi4_tmp_ptr[1] = x3 + x2
    vsub.s32      q8, q12, q13              @ pi4_tmp_ptr[2] = x0 - x1
    vsub.s32      q11, q15, q14             @ pi4_tmp_ptr[3] = x3 - x2

    @ ---- 4x4 transpose of 32-bit elements (q2, q3, q8, q11) ----
    vtrn.32       q2, q3                    @ transpose the 2x2 sub-blocks
    vtrn.32       q8, q11

    vswp          d5, d16                   @ q2 = row 0 (x4), q8  = row 2 (x6)
    vswp          d7, d22                   @ q3 = row 1 (x5), q11 = row 3 (x7)

    @ ---- vertical pass: each register is one row ----
    vadd.s32      q12, q2, q11              @ x0 = x4 + x7
    vadd.s32      q13, q3, q8               @ x1 = x5 + x6
    vsub.s32      q14, q3, q8               @ x2 = x5 - x6
    vsub.s32      q15, q2, q11              @ x3 = x4 - x7

    vadd.s32      q0, q12, q13              @ output row 0 = x0 + x1
    vadd.s32      q1, q15, q14              @ output row 1 = x3 + x2
    vsub.s32      q2, q12, q13              @ output row 2 = x0 - x1
    vsub.s32      q3, q15, q14              @ output row 3 = x3 - x2

@=======================SCALING===================================================

    vmul.s32      q0, q0, q9                @ p[i] = x[i] * scale   where i = 0..3
    vmul.s32      q1, q1, q9                @ p[i] = x[i] * scale   where i = 4..7
    vmul.s32      q2, q2, q9                @ p[i] = x[i] * scale   where i = 8..11
    vmul.s32      q3, q3, q9                @ p[i] = x[i] * scale   where i = 12..15

    vshl.s32      q0, q0, q10               @ q[i] = p[i] << (qP/6) where i = 0..3
    vshl.s32      q1, q1, q10               @ q[i] = p[i] << (qP/6) where i = 4..7
    vshl.s32      q2, q2, q10               @ q[i] = p[i] << (qP/6) where i = 8..11
    vshl.s32      q3, q3, q10               @ q[i] = p[i] << (qP/6) where i = 12..15

    vqrshrn.s32   d0, q0, #0x6              @ c[i] = sat16((q[i] + 32) >> 6) where i = 0..3
    vqrshrn.s32   d1, q1, #0x6              @ c[i] = sat16((q[i] + 32) >> 6) where i = 4..7
    vqrshrn.s32   d2, q2, #0x6              @ c[i] = sat16((q[i] + 32) >> 6) where i = 8..11
    vqrshrn.s32   d3, q3, #0x6              @ c[i] = sat16((q[i] + 32) >> 6) where i = 12..15

    vst1.s16      {d0, d1, d2, d3}, [r1]    @ store all four output rows (16 x WORD16)

    bx            lr                        @ leaf return; no registers to restore



@ *******************************************************************************
@ *
@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse transformed content is then scaled based on the Qp value.
@ *  The DC blocks of both U and V are processed
@ *
@ * @param[in] pi2_src
@ *  input 1x8 block of coeffs.
@ *  First 4 are from U and next from V
@ *
@ * @param[out] pi2_out
@ *  output 1x8 block
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list (only element [0] is read)
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix (only element [0] is read)
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6)
@ *
@ * @returns none
@ *
@ * @remarks none
@ *
@ *******************************************************************************
@ *
@ *
@ *******************************************************************************
@ *
@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
@                                     WORD16* pi2_out,
@                                     const UWORD16 *pu2_iscal_mat,
@                                     const UWORD16 *pu2_weigh_mat,
@                                     UWORD32 u4_qp_div_6, ...
@
@ NOTE(review): the prototype comment in this file was cut off after
@ u4_qp_div_6; confirm the full parameter list against the C declaration.
@ This routine reads nothing beyond the first stack word.

    .global ih264_ihadamard_scaling_2x2_uv_a9
ih264_ihadamard_scaling_2x2_uv_a9:

@Registers used
@   r0   : *pi2_src
@   r1   : *pi2_out
@   r2   : *pu2_iscal_mat
@   r3   : *pu2_weigh_mat
@   [sp] : u4_qp_div_6
@
@Clobbers: q0-q3, q8-q15.
@Only caller-saved NEON registers are used, so nothing is saved on the stack.
@
@Each output is ((x * pu2_iscal_mat[0] * pu2_weigh_mat[0]) << u4_qp_div_6) >> 5,
@implemented as one VSHL by the signed count (u4_qp_div_6 - 5); a negative
@count makes VSHL shift right.

    @ ---- scale factor: iscal[0] * weigh[0] replicated to all 32-bit lanes ----
    vld1.u16      d26[0], [r2]              @ lane 0 = pu2_iscal_mat[0]
    vld1.u16      d27[0], [r3]              @ lane 0 = pu2_weigh_mat[0]
    vmull.u16     q15, d26, d27             @ lane 0 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
                                            @ (other lanes are don't-care)
    vdup.u32      q15, d30[0]               @ broadcast lane 0 product

    @ ---- shift count: (qp/6 - 5) replicated to all 32-bit lanes ----
    vld1.u16      d28[0], [sp]              @ halfword at the lowest address of u4_qp_div_6
                                            @ (its low 16 bits on a little-endian target)
    vmov.u16      d29, #5
    vsubl.u16     q14, d28, d29             @ lane 0 = qp/6 - 5, widened to 32 bits
    vdup.s32      q14, d28[0]               @ broadcast lane 0 (d28 is the low half of q14)

    @ ---- 2x2 inverse hadamard on U and V together ----
    vld2.s16      {d0, d1}, [r0]            @ load 8 dc coeffs, de-interleaved:
                                            @ i2_x4,i2_x6,i2_y4,i2_y6 -> d0
                                            @ i2_x5,i2_x7,i2_y5,i2_y7 -> d1

    vaddl.s16     q1, d0, d1                @ i4_x0 = i4_x4 + i4_x5; i4_x2 = i4_x6 + i4_x7 (same for y)
    vsubl.s16     q2, d0, d1                @ i4_x1 = i4_x4 - i4_x5; i4_x3 = i4_x6 - i4_x7 (same for y)

    vtrn.s32      q1, q2                    @ q1 = x0,x1,y0,y1   q2 = x2,x3,y2,y3

    vadd.s32      q3, q1, q2                @ i4_x4 = i4_x0+i4_x2; i4_x5 = i4_x1+i4_x3 (same for y)
    vsub.s32      q1, q1, q2                @ i4_x6 = i4_x0-i4_x2; i4_x7 = i4_x1-i4_x3 (same for y)

    @ ---- scaling ----
    vmul.s32      q10, q3, q15              @ multiply by the scale factor
    vmul.s32      q11, q1, q15

    vshl.s32      q12, q10, q14             @ shift by (qp/6 - 5): left if positive, right if negative
    vshl.s32      q8, q11, q14

    vmovn.s32     d18, q12                  @ i4_x4 i4_x5 i4_y4 i4_y5 (narrowed to 16 bit)
    vmovn.s32     d19, q8                   @ i4_x6 i4_x7 i4_y6 i4_y7 (narrowed to 16 bit)

    vst2.s32      {d18-d19}, [r1]           @ interleave 32-bit pairs -> x4 x5 x6 x7 y4 y5 y6 y7

    bx            lr                        @ leaf return; no registers to restore


