///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// *  ihevc_itrans_recon_4x4_neon.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  naveen sr
// *
// * @par list of functions:
// *  - ihevc_itrans_recon_4x4()
// *
// * @remarks
// *  Syntax: GNU as, AArch64 (A64 + Advanced SIMD). ABI: AAPCS64.
// *
// *******************************************************************************
//*/
// /**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for 4x4
// *  input block
// *
// * @par description:
// *  performs a two-pass 4x4 inverse transform (pass 1 rounds/shifts by
// *  shift_stage1_idct, pass 2 by shift_stage2_idct), adds the prediction
// *  data and clips the output to 8 bit
// *
// * @param[in] pi2_src
// *  input 4x4 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 4x4 buffer for storing inverse transform 1st stage output
// *  (not accessed by this implementation; the intermediate stays in
// *  vector registers)
// *
// * @param[in] pu1_pred
// *  prediction 4x4 block
// *
// * @param[out] pu1_dst
// *  output 4x4 block
// *
// * @param[in] src_strd
// *  input stride (in word16 units; doubled internally to get bytes)
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src (not read by this implementation)
// *
// * @returns void
// *
// * @remarks
// *  Leaf function; uses no stack.
// *  General registers written: x0, x2, x3, x4, x5, x6, x8, x9, x10.
// *  Vector registers written:  v0-v7, v16-v20, v22-v31 (no callee-saved
// *  v8-v15 are touched).
// *
// *******************************************************************************
// */
//void ihevc_itrans_recon_4x4(word16 *pi2_src,
//                            word16 *pi2_tmp,
//                            uword8 *pu1_pred,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 pred_strd,
//                            word32 dst_strd,
//                            word32 zero_cols)
//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp      (unused)
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    w4 => src_strd      (sign-extended into x4 on entry)
//    w5 => pred_strd     (sign-extended into x5 on entry)
//    w6 => dst_strd      (sign-extended into x6 on entry)
//    w7 => zero_cols     (unused)

    .text
    .align 4

    .include "ihevc_neon_macros.s"

    .set shift_stage1_idct , 7
    .set shift_stage2_idct , 12



    .globl ihevc_itrans_recon_4x4_av8

    .extern g_ai2_ihevc_trans_4_transpose

    .type ihevc_itrans_recon_4x4_av8, %function

ihevc_itrans_recon_4x4_av8:

    // The three strides are 32-bit arguments. AAPCS64 leaves bits [63:32]
    // of such an argument register unspecified, and every use below is a
    // 64-bit address computation, so widen them explicitly first.
    sxtw        x4, w4                      // src_strd
    sxtw        x5, w5                      // pred_strd
    sxtw        x6, w6                      // dst_strd

    // x8 = &g_ai2_ihevc_trans_4_transpose (through the GOT)
    adrp        x8, :got:g_ai2_ihevc_trans_4_transpose
    ldr         x8, [x8, #:got_lo12:g_ai2_ihevc_trans_4_transpose]

    add         x4, x4, x4                  // x4 = src_strd in bytes (2 * word16 stride)
    add         x9, x0, x4                  // x9 = address of pi2_src row 1

    ld1         {v4.4h}, [x8]               // first row of g_ai2_ihevc_trans_4_transpose
                                            // per the original author's note:
                                            //   d4    = {36,64,83,64}
                                            //   index =   3  2  1  0
    add         x10, x9, x4, lsl #1         // x10 = address of pi2_src row 3
    add         x4, x4, x4                  // x4 = byte distance between rows 0 and 2
    ld1         {v1.4h}, [x9]               // pi2_src row 1
    ld1         {v3.4h}, [x10]              // pi2_src row 3
    ld1         {v0.4h}, [x0], x4           // pi2_src row 0, x0 -> row 2
    ld1         {v2.4h}, [x0], x4           // pi2_src row 2


    // ---------------- first stage ----------------
    // in : v0..v3 = pi2_src rows 0..3
    smull       v6.4s, v1.4h, v4.h[1]       // 83 * row1
    smlal       v6.4s, v3.4h, v4.h[3]       // o[0] = 83 * row1 + 36 * row3
    smull       v5.4s, v1.4h, v4.h[3]       // 36 * row1
    ld1         {v22.s}[0], [x2], x5        // pu1_pred row 0 (interleaved with the math)
    smlsl       v5.4s, v3.4h, v4.h[1]       // o[1] = 36 * row1 - 83 * row3

    saddl       v7.4s, v0.4h, v2.4h         // row0 + row2
    ssubl       v17.4s, v0.4h, v2.4h        // row0 - row2
    shl         v7.4s, v7.4s, #6            // e[0] = 64 * (row0 + row2)
    shl         v17.4s, v17.4s, #6          // e[1] = 64 * (row0 - row2)

    add         v19.4s, v7.4s, v6.4s        // e[0] + o[0]
    add         v16.4s, v17.4s, v5.4s       // e[1] + o[1]
    sub         v18.4s, v17.4s, v5.4s       // e[1] - o[1]
    sub         v20.4s, v7.4s, v6.4s        // e[0] - o[0]

    // rounding right shift + saturating narrow to signed 16 bit
    sqrshrn     v28.4h, v19.4s, #shift_stage1_idct  // out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    sqrshrn     v29.4h, v16.4s, #shift_stage1_idct  // out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    sqrshrn     v30.4h, v18.4s, #shift_stage1_idct  // out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    sqrshrn     v31.4h, v20.4s, #shift_stage1_idct  // out[3] = clip_s16((e[0] - o[0] + add) >> shift)

    // 4x4 transpose of v28..v31 into v0..v3
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s
    trn2        v2.2s, v24.2s, v26.2s
    trn1        v1.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s

    // first stage ends
    // output in d0,d1,d2,d3

    // ---------------- second stage ----------------
    // in : v0..v3 = transposed first-stage output (called t0..t3 below)
    smull       v6.4s, v1.4h, v4.h[1]       // 83 * t1
    ld1         {v22.s}[1], [x2], x5        // pu1_pred row 1
    smlal       v6.4s, v3.4h, v4.h[3]       // o[0] = 83 * t1 + 36 * t3
    smull       v5.4s, v1.4h, v4.h[3]       // 36 * t1
    smlsl       v5.4s, v3.4h, v4.h[1]       // o[1] = 36 * t1 - 83 * t3
    ld1         {v23.s}[0], [x2], x5        // pu1_pred row 2

    saddl       v7.4s, v0.4h, v2.4h         // t0 + t2
    ssubl       v17.4s, v0.4h, v2.4h        // t0 - t2
    shl         v7.4s, v7.4s, #6            // e[0] = 64 * (t0 + t2)
    shl         v17.4s, v17.4s, #6          // e[1] = 64 * (t0 - t2)


    add         v19.4s, v7.4s, v6.4s        // e[0] + o[0]
    add         v16.4s, v17.4s, v5.4s       // e[1] + o[1]
    sub         v18.4s, v17.4s, v5.4s       // e[1] - o[1]
    sub         v20.4s, v7.4s, v6.4s        // e[0] - o[0]

    sqrshrn     v28.4h, v19.4s, #shift_stage2_idct  // out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    sqrshrn     v29.4h, v16.4s, #shift_stage2_idct  // out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    sqrshrn     v30.4h, v18.4s, #shift_stage2_idct  // out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    sqrshrn     v31.4h, v20.4s, #shift_stage2_idct  // out[3] = clip_s16((e[0] - o[0] + add) >> shift)
    ld1         {v23.s}[1], [x2], x5        // pu1_pred row 3

    // 4x4 transpose of v28..v31 into v0..v3
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s
    trn2        v2.2s, v24.2s, v26.2s
    trn1        v1.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s
    // second stage ends
    // output in d0,d1,d2,d3

    // ---------------- reconstruction ----------------
    // Pack two 4-sample rows per 128-bit register so they line up with the
    // prediction rows already held in v22 (rows 0,1) and v23 (rows 2,3).
    mov         v0.d[1], v1.d[0]            // v0 = {d0, d1}
    mov         v2.d[1], v3.d[0]            // v2 = {d2, d3}

    uaddw       v0.8h, v0.8h, v22.8b        // pi2_out(16bit) + pu1_pred(8bit)
    uaddw       v2.8h, v2.8h, v23.8b        // pi2_out(16bit) + pu1_pred(8bit)
    sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
    sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))

    // storing destination, one 4-byte row per store
    st1         {v0.s}[0], [x3], x6
    st1         {v0.s}[1], [x3], x6
    st1         {v1.s}[0], [x3], x6
    st1         {v1.s}[1], [x3], x6

    ret