///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// *  ihevc_itrans_recon_4x4_ttype1.s
// *
// * @brief
// *  contains function definitions for inverse transform and reconstruction
// *
// *
// * @author
// *  naveen sr
// *
// * @par list of functions:
// *  - ihevc_itrans_recon_4x4_ttype1()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

///* all the functions here are replicated from ihevc_itrans.c and modified to */
///* include reconstruction */
//
///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform type 1 (dst) and reconstruction
// *  for 4x4 input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// *  to 8 bit
// *
// * @param[in] pi2_src
// *  input 4x4 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 4x4 buffer for storing inverse
// *  transform 1st stage output
// *  (not referenced by this implementation: the intermediate stays in
// *  NEON registers)
// *
// * @param[in] pu1_pred
// *  prediction 4x4 block
// *
// * @param[out] pu1_dst
// *  output 4x4 block
// *
// * @param[in] src_strd
// *  input stride, in units of word16 elements (doubled below to get bytes)
// *
// * @param[in] pred_strd
// *  prediction stride (used as a byte offset between rows)
// *
// * @param[in] dst_strd
// *  output stride (used as a byte offset between rows)
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src (not referenced by this implementation)
// *
// * @returns  void
// *
// * @remarks
// *  leaf function; touches only x0, x2-x6, x8-x11, v0-v7, v16-v22 and
// *  v24-v31, so nothing has to be saved on the stack
// *
// *******************************************************************************
// */
//void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
//                                   word16 *pi2_tmp,
//                                   uword8 *pu1_pred,
//                                   uword8 *pu1_dst,
//                                   word32 src_strd,
//                                   word32 pred_strd,
//                                   word32 dst_strd,
//                                   word32 zero_cols)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp      (unused)
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    w4 => src_strd      (sign-extended into x4 on entry)
//    w5 => pred_strd     (sign-extended into x5 on entry)
//    w6 => dst_strd      (sign-extended into x6 on entry)
//    x7 => zero_cols     (unused)

.text
.align 4

.include "ihevc_neon_macros.s"

.set shift_stage1_idct ,   7
.set shift_stage2_idct ,   12

.globl ihevc_itrans_recon_4x4_ttype1_av8

.type ihevc_itrans_recon_4x4_ttype1_av8, %function

ihevc_itrans_recon_4x4_ttype1_av8:

    // The three strides are 32-bit C ints. Under AAPCS64 the upper 32 bits
    // of their argument registers are unspecified, so widen them before
    // they are used as 64-bit post-index offsets.
    sxtw        x4, w4
    sxtw        x5, w5
    sxtw        x6, w6
    lsl         x4, x4, #1                  // src_strd: word16 elements -> bytes

    // v4.4h = { 29, 55, 74, 84 } : the transform coefficients
    mov         w8, #29
    mov         w9, #55
    mov         w10, #74
    mov         w11, #84
    mov         v4.h[0], w8
    ld1         {v0.4h}, [x0], x4           // pi2_src row 0
    mov         v4.h[1], w9
    ld1         {v1.4h}, [x0], x4           // pi2_src row 1
    mov         v4.h[2], w10
    ld1         {v2.4h}, [x0], x4           // pi2_src row 2
    mov         v4.h[3], w11
    ld1         {v3.4h}, [x0], x4           // pi2_src row 3

    // ---------------- first stage ----------------
    // Each lane processes one column; v0..v3 are pi2_src[0..3] of that column.

    // pi2_out[0] = 29*pi2_src[0] + 74*pi2_src[1] + 84*pi2_src[2] + 55*pi2_src[3]
    smull       v6.4s, v1.4h, v4.h[2]
    smlal       v6.4s, v0.4h, v4.h[0]
    smlal       v6.4s, v3.4h, v4.h[1]
    smlal       v6.4s, v2.4h, v4.h[3]

    // pi2_out[1] = 55*pi2_src[0] + 74*pi2_src[1] - 29*pi2_src[2] - 84*pi2_src[3]
    smull       v5.4s, v1.4h, v4.h[2]
    smlal       v5.4s, v0.4h, v4.h[1]
    smlsl       v5.4s, v2.4h, v4.h[0]
    smlsl       v5.4s, v3.4h, v4.h[3]

    // pi2_out[2] = 74*pi2_src[0] - 74*pi2_src[2] + 74*pi2_src[3]
    smull       v7.4s, v0.4h, v4.h[2]
    smlsl       v7.4s, v2.4h, v4.h[2]
    smlal       v7.4s, v3.4h, v4.h[2]

    // pi2_out[3] = 84*pi2_src[0] - 74*pi2_src[1] + 55*pi2_src[2] - 29*pi2_src[3]
    smull       v20.4s, v2.4h, v4.h[1]
    smlsl       v20.4s, v1.4h, v4.h[2]
    smlsl       v20.4s, v3.4h, v4.h[0]
    smlal       v20.4s, v0.4h, v4.h[3]

    // (pi2_out[i] + rounding) >> shift_stage1_idct, saturated to 16 bit
    sqrshrn     v28.4h, v6.4s, #shift_stage1_idct
    sqrshrn     v29.4h, v5.4s, #shift_stage1_idct
    sqrshrn     v30.4h, v7.4s, #shift_stage1_idct
    sqrshrn     v31.4h, v20.4s, #shift_stage1_idct
    ld1         {v18.s}[0], [x2], x5        // pred row 0 (interleaved with the math)

    // 4x4 transpose of v28..v31
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v21.2s, v24.2s, v26.2s
    trn2        v16.2s, v24.2s, v26.2s
    trn1        v22.2s, v25.2s, v27.2s
    trn2        v17.2s, v25.2s, v27.2s
    // transposed rows 0,1,2,3 are now in v21, v22, v16, v17
    // first stage computation ends

    // ---------------- second stage ----------------
    // Same arithmetic as the first stage with the inputs remapped:
    //   pi2_src[0] -> v21, pi2_src[1] -> v22, pi2_src[2] -> v16, pi2_src[3] -> v17
    ld1         {v18.s}[1], [x2], x5        // pred row 1

    // pi2_out[0] = 29*pi2_src[0] + 74*pi2_src[1] + 84*pi2_src[2] + 55*pi2_src[3]
    smull       v6.4s, v22.4h, v4.h[2]
    smlal       v6.4s, v21.4h, v4.h[0]
    smlal       v6.4s, v17.4h, v4.h[1]
    smlal       v6.4s, v16.4h, v4.h[3]

    // pi2_out[1] = 55*pi2_src[0] + 74*pi2_src[1] - 29*pi2_src[2] - 84*pi2_src[3]
    smull       v5.4s, v22.4h, v4.h[2]
    smlal       v5.4s, v21.4h, v4.h[1]
    smlsl       v5.4s, v16.4h, v4.h[0]
    smlsl       v5.4s, v17.4h, v4.h[3]

    // pi2_out[2] = 74*pi2_src[0] - 74*pi2_src[2] + 74*pi2_src[3]
    smull       v7.4s, v21.4h, v4.h[2]
    smlsl       v7.4s, v16.4h, v4.h[2]
    smlal       v7.4s, v17.4h, v4.h[2]
    ld1         {v19.s}[0], [x2], x5        // pred row 2

    // pi2_out[3] = 84*pi2_src[0] - 74*pi2_src[1] + 55*pi2_src[2] - 29*pi2_src[3]
    smull       v20.4s, v16.4h, v4.h[1]
    smlsl       v20.4s, v22.4h, v4.h[2]
    smlsl       v20.4s, v17.4h, v4.h[0]
    smlal       v20.4s, v21.4h, v4.h[3]

    // (pi2_out[i] + rounding) >> shift_stage2_idct, saturated to 16 bit
    sqrshrn     v28.4h, v6.4s, #shift_stage2_idct
    sqrshrn     v29.4h, v5.4s, #shift_stage2_idct
    sqrshrn     v30.4h, v7.4s, #shift_stage2_idct
    sqrshrn     v31.4h, v20.4s, #shift_stage2_idct
    ld1         {v19.s}[1], [x2], x5        // pred row 3

    // 4x4 transpose of v28..v31
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s
    trn2        v2.2s, v24.2s, v26.2s
    trn1        v1.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s
    // residual rows 0,1,2,3 are now in v0, v1, v2, v3
    // second stage computation ends

    // ---------------- reconstruction ----------------
    // Pack two residual rows per 128-bit register to line up with the
    // two prediction rows held in v18 (rows 0,1) and v19 (rows 2,3).
    mov         v0.d[1], v1.d[0]            // v0.8h = residual rows 0 and 1
    mov         v2.d[1], v3.d[0]            // v2.8h = residual rows 2 and 3

    uaddw       v0.8h, v0.8h, v18.8b        // pi2_out(16bit) + pu1_pred(8bit)
    sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out + pu1_pred)
    uaddw       v2.8h, v2.8h, v19.8b        // pi2_out(16bit) + pu1_pred(8bit)
    sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out + pu1_pred)

    // storing destination, one 4-byte row per store
    st1         {v0.s}[0], [x3], x6
    st1         {v0.s}[1], [x3], x6
    st1         {v1.s}[0], [x3], x6
    st1         {v1.s}[1], [x3], x6

    ret