@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.align 4

@/**
@/*******************************************************************************
@/*
@/* @brief
@/*  Residue calculation and Forward Transform for 4x4 block with 8-bit input
@/*
@/* @par Description:
@/*  Performs residue calculation by subtracting source and prediction and
@/*  followed by forward transform
@/*
@/* @param[in] pu1_src
@/*  Input 4x4 pixels
@/*
@/* @param[in] pu1_pred
@/*  Prediction data
@/*
@/* @param[in] pi4_tmp
@/*  Temporary buffer of size 4x4
@/*
@/* @param[out] pi2_dst
@/*  Output 4x4 coefficients
@/*
@/* @param[in] src_strd
@/*  Input stride
@/*
@/* @param[in] pred_strd
@/*  Prediction Stride
@/*
@/* @param[in] dst_strd_chr_flag
@/*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
@/*
@/* @returns Void
@/*
@/* @remarks
@/*  None
@/*
@/*******************************************************************************
@/*/

@/**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_pred
@ r2 => *pi4_temp
@ r3 => *pi2_dst
@ r4 => src_strd
@ r5 => pred_strd
@ r6 => dst_strd_chr_flag

.global ihevc_resi_trans_4x4_a9q

ihevc_resi_trans_4x4_a9q:

    STMFD sp!, {r4-r7, r14}     @ store all the register components from caller function to memory
    LDR r4, [sp,#20]            @ r4 contains src_strd
    LDR r5, [sp,#24]            @ r5 contains pred_strd
    LDR r6, [sp,#28]            @ r6 contains dst_strd_chr_flag

    ANDS r7, r6, #1             @ check for chroma flag, if present interleaved data
    CMP r7, #0
    BEQ NON_INTERLEAVE_LOAD     @ if flag == 0, use non-interleaving loads

    VLD1.64 d0, [r0], r4        @ load row 0 src
    VLD1.64 d4, [r0], r4        @ load row 1 src
    VLD1.64 d1, [r0], r4        @ load row 2 src
    VLD1.64 d5, [r0], r4        @ load row 3 src
    VUZP.8 d0, d4               @ de-interleaving unzip instruction to get luma data of pu1_src in d0
    VUZP.8 d1, d5               @ de-interleaving unzip instruction to get luma data of pu1_src in d1

    VLD1.64 d2, [r1], r5        @ load row 0 pred
    VLD1.64 d6, [r1], r5        @ load row 1 pred
    VLD1.64 d3, [r1], r5        @ load row 2 pred
    VLD1.64 d7, [r1], r5        @ load row 3 pred
    VUZP.8 d2, d6               @ de-interleaving unzip instruction to get luma data of pu1_pred in d2
    VUZP.8 d3, d7               @ de-interleaving unzip instruction to get luma data of pu1_pred in d3

    B LOAD_END

NON_INTERLEAVE_LOAD:
    VLD1.U32 d0[0], [r0], r4    @ load row 0 src
    VLD1.U32 d0[1], [r0], r4    @ load row 1 src
    VLD1.U32 d1[0], [r0], r4    @ load row 2 src
    VLD1.U32 d1[1], [r0], r4    @ load row 3 src

    VLD1.U32 d2[0], [r1], r5    @ load row 0 pred
    VLD1.U32 d2[1], [r1], r5    @ load row 1 pred
    VLD1.U32 d3[0], [r1], r5    @ load row 2 pred
    VLD1.U32 d3[1], [r1], r5    @ load row 3 pred

LOAD_END:
    @ Finding the residue
    VSUBL.U8 q2, d0, d2         @ q2 contains 1st 16-bit 8 residues
    VSUBL.U8 q3, d1, d3         @ q3 contains 2nd 16-bit 8 residues

    @ SAD calculation
    VABDL.U8 q12, d0, d2        @ q12 contains absolute differences
    VABAL.U8 q12, d1, d3        @ q12 accumulates absolute differences
    VADD.U16 d26, d24, d25      @ add d-registers of q12
    VPADDL.U16 d27, d26         @ d27 contains 2 32-bit values that have to be added
    VPADDL.U32 d28, d27         @ d28 contains 64-bit SAD, only LSB important
    VMOV.32 r0, d28[0]          @ SAD stored in r0 for return
    @ SAD calculation ends

    @ Forward transform - step 1
    VMOV.I16 d2, #64            @ generate immediate constant in d2 for even row multiplication
    VTRN.16 d4, d5              @ 3-step transpose of residue matrix starts
    VTRN.16 d6, d7              @ 2nd step of the 3-step matrix transpose
    VMOV.I16 d0, #83            @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q2, q3              @ Final step of matrix transpose

    VMOV.I16 d1, #36            @ generate immediate constant in d1 for odd row multiplication
    VSWP d6, d7                 @ vector swap to allow even and odd row calculation using Q registers
    VADD.S16 q10, q2, q3        @ q10 has the even array
    VSUB.S16 q11, q2, q3        @ q11 has the odd array
    VMULL.S16 q12, d20, d2      @ e[0]*64
    VMLAL.S16 q12, d21, d2[0]   @ row 1 of results: e[0]*64 + e[1]*64
    VMULL.S16 q13, d20, d2      @ e[0]*64
    VMLSL.S16 q13, d21, d2[0]   @ row 3 of results: e[0]*64 - e[1]*64
    VMULL.S16 q8, d22, d0       @ o[0]*83
    VMLAL.S16 q8, d23, d1[0]    @ row 2 of results: o[0]*83 + o[1]*36
    VMULL.S16 q9, d22, d1       @ o[0]*36
    VMLSL.S16 q9, d23, d0[0]    @ row 4 of results: o[0]*36 - o[1]*83

    @ Forward transform - step 2
    VMOV.I32 d2, #64            @ generate immediate constant in d2 for even row multiplication
    VMOV.I32 d0, #83            @ generate immediate constant in d0 for odd row multiplication
    VTRN.32 q12, q8             @ 4-step transpose of residue matrix starts
    VTRN.32 q13, q9             @ 2nd step of the 4-step matrix transpose

    VMOV.I32 d1, #36            @ generate immediate constant in d1 for odd row multiplication
    VSWP d25, d26               @ 3rd step of the 4-step matrix transpose
    VSWP d17, d18               @ 4th step of the 4-step matrix transpose
    VADD.S32 q2, q12, q9        @ e[0]
    VADD.S32 q3, q8, q13        @ e[1]
    VSUB.S32 q10, q12, q9       @ o[0]
    VSUB.S32 q11, q8, q13       @ o[1]

    VMUL.S32 q12, q2, d2[0]     @ e[0]*64
    VMLA.S32 q12, q3, d2[0]     @ row 1 of results: e[0]*64 + e[1]*64
    VMUL.S32 q13, q2, d2[0]     @ e[0]*64
    VMLS.S32 q13, q3, d2[0]     @ row 3 of results: e[0]*64 - e[1]*64
    VMUL.S32 q8, q10, d0[0]     @ o[0]*83
    VMLA.S32 q8, q11, d1[0]     @ row 2 of results: o[0]*83 + o[1]*36
    VMUL.S32 q9, q10, d1[0]     @ o[0]*36
    VMLS.S32 q9, q11, d0[0]     @ row 4 of results: o[0]*36 - o[1]*83

    VRSHRN.S32 d0, q12, #9      @ (row1 + 256)/512
    VRSHRN.S32 d1, q8, #9       @ (row2 + 256)/512
    VRSHRN.S32 d2, q13, #9      @ (row3 + 256)/512
    VRSHRN.S32 d3, q9, #9       @ (row4 + 256)/512

    LSR r7, r6, #15             @ r7 = 2*dst_strd, since pi2_dst holds 16-bit coefficients
    VST1.U16 d0, [r3], r7       @ store 1st row of result
    VST1.U16 d1, [r3], r7       @ store 2nd row of result
    VST1.U16 d2, [r3], r7       @ store 3rd row of result
    VST1.U16 d3, [r3], r7       @ store 4th row of result

    LDMFD sp!,{r4-r7,r15}       @ Reload the registers from SP

@ Function End
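
@ For reference, the routine above is equivalent to roughly the following scalar
@ C model (illustrative sketch only, written from the assembly above; it is not
@ part of the original sources, and the chroma de-interleaving path is omitted).
@ The residue is source minus prediction, its SAD is the return value, and the
@ 4x4 DCT butterfly is applied to rows and then columns with a single rounding
@ shift of 9 at the end, as coded above:
@
@   #include <stdint.h>
@
@   static uint32_t resi_trans_4x4_ref(const uint8_t *src, const uint8_t *pred,
@                                      int16_t *dst, int src_strd, int pred_strd,
@                                      int dst_strd)
@   {
@       int32_t resi[4][4], tmp[4][4];
@       uint32_t sad = 0;
@       for (int i = 0; i < 4; i++)
@           for (int j = 0; j < 4; j++) {
@               resi[i][j] = src[i * src_strd + j] - pred[i * pred_strd + j];
@               sad += (uint32_t)(resi[i][j] < 0 ? -resi[i][j] : resi[i][j]);
@           }
@       for (int i = 0; i < 4; i++) {                   /* horizontal pass */
@           int32_t e0 = resi[i][0] + resi[i][3], e1 = resi[i][1] + resi[i][2];
@           int32_t o0 = resi[i][0] - resi[i][3], o1 = resi[i][1] - resi[i][2];
@           tmp[0][i] = 64 * e0 + 64 * e1;
@           tmp[1][i] = 83 * o0 + 36 * o1;
@           tmp[2][i] = 64 * e0 - 64 * e1;
@           tmp[3][i] = 36 * o0 - 83 * o1;
@       }
@       for (int i = 0; i < 4; i++) {                   /* vertical pass + rounding */
@           int32_t e0 = tmp[i][0] + tmp[i][3], e1 = tmp[i][1] + tmp[i][2];
@           int32_t o0 = tmp[i][0] - tmp[i][3], o1 = tmp[i][1] - tmp[i][2];
@           dst[0 * dst_strd + i] = (int16_t)((64 * e0 + 64 * e1 + 256) >> 9);
@           dst[1 * dst_strd + i] = (int16_t)((83 * o0 + 36 * o1 + 256) >> 9);
@           dst[2 * dst_strd + i] = (int16_t)((64 * e0 - 64 * e1 + 256) >> 9);
@           dst[3 * dst_strd + i] = (int16_t)((36 * o0 - 83 * o1 + 256) >> 9);
@       }
@       return sad;
@   }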

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform type 1
@*  on input pixels
@*
@* @description
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by forward transform
@*
@* @param[in] pu1_src
@*  Input 4x4 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 4x4
@*
@* @param[out] pi2_dst
@*  Output 4x4 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd_chr_flag
@*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
@*
@* @returns void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
@                                     UWORD8 *pu1_pred,
@                                     WORD32 *pi4_temp,
@                                     WORD16 *pi2_dst,
@                                     WORD32 src_strd,
@                                     WORD32 pred_strd,
@                                     WORD32 dst_strd_chr_flag);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]   - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd_chr_flag
@
@*******************************************************************************

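@ A rough scalar model of the transform-type-1 (DST-style) path below, using the
@ {29, 55, 74, 84} basis with a rounding shift of 1 after the first pass and 8
@ after the second, as coded in the assembly (illustrative sketch only, not part
@ of the original sources; the SAD of the residue is the return value):
@
@   #include <stdint.h>
@
@   static uint32_t resi_trans_4x4_ttype1_ref(const uint8_t *src, const uint8_t *pred,
@                                             int16_t *dst, int src_strd,
@                                             int pred_strd, int dst_strd)
@   {
@       int32_t r[4][4], t[4][4];
@       uint32_t sad = 0;
@       for (int i = 0; i < 4; i++)
@           for (int j = 0; j < 4; j++) {
@               r[i][j] = src[i * src_strd + j] - pred[i * pred_strd + j];
@               sad += (uint32_t)(r[i][j] < 0 ? -r[i][j] : r[i][j]);
@           }
@       for (int i = 0; i < 4; i++) {   /* first pass, rounding shift 1 */
@           const int32_t *x = r[i];
@           t[0][i] = (29 * x[0] + 55 * x[1] + 74 * x[2] + 84 * x[3] + 1) >> 1;
@           t[1][i] = (74 * (x[0] + x[1] - x[3]) + 1) >> 1;
@           t[2][i] = (84 * x[0] - 29 * x[1] - 74 * x[2] + 55 * x[3] + 1) >> 1;
@           t[3][i] = (55 * x[0] - 84 * x[1] + 74 * x[2] - 29 * x[3] + 1) >> 1;
@       }
@       for (int i = 0; i < 4; i++) {   /* second pass, rounding shift 8 */
@           const int32_t *x = t[i];
@           dst[0 * dst_strd + i] = (int16_t)((29 * x[0] + 55 * x[1] + 74 * x[2] + 84 * x[3] + 128) >> 8);
@           dst[1 * dst_strd + i] = (int16_t)((74 * (x[0] + x[1] - x[3]) + 128) >> 8);
@           dst[2 * dst_strd + i] = (int16_t)((84 * x[0] - 29 * x[1] - 74 * x[2] + 55 * x[3] + 128) >> 8);
@           dst[3 * dst_strd + i] = (int16_t)((55 * x[0] - 84 * x[1] + 74 * x[2] - 29 * x[3] + 128) >> 8);
@       }
@       return sad;
@   }
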
.global ihevc_resi_trans_4x4_ttype1_a9q

ihevc_resi_trans_4x4_ttype1_a9q:

    PUSH {r4}
    vpush {d8 - d15}

    LDR r2,[sp,#68]             @ r2 = src_strd
    LDR r4,[sp,#72]             @ r4 = pred_strd

    VLD1.32 d2[0],[r0],r2       @ Row 1 of source in d2[0]
    VLD1.32 d3[0],[r1],r4       @ Row 1 of prediction in d3[0]
    VLD1.32 d2[1],[r0],r2       @ Row 2 of source in d2[1]
    VLD1.32 d3[1],[r1],r4       @ Row 2 of prediction in d3[1]

    VLD1.32 d8[0],[r0],r2       @ Row 3 of source in d8[0]
    VABDL.U8 q0,d2,d3           @ Absolute differences of rows 1 and 2 in d0
                                @ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
    VLD1.32 d9[0],[r1],r4       @ Row 3 of prediction in d9[0]
    VSUBL.U8 q5,d2,d3           @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
    VLD1.32 d8[1],[r0]          @ Row 4 of source in d8[1]
    VTRN.16 d10,d11             @ Transpose step 1
    VLD1.32 d9[1],[r1]          @ Row 4 of prediction in d9[1]

    VSUBL.U8 q6,d8,d9           @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
                                @ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
    VABAL.U8 q0,d8,d9           @ Absolute differences of rows 3 and 4 in d1
    VTRN.16 d12,d13             @ Transpose step 2
    VTRN.32 q5,q6               @ Transpose step 3, Residue block transposed
                                @ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
    VADD.S16 d23,d11,d13        @ d23 = C2 + C4
    VMOV.I32 d6,#55             @ Constant used for multiplication
    VADD.S16 d22,d10,d13        @ d22 = C1 + C4
    VADD.U16 d0,d1,d0           @ Accumulating SAD step 1
    VMOV.I32 d7,#84             @ Constant used for multiplication
    VMULL.S16 q7,d23,d6[0]      @ q7 = 55*C2 + 55*C4
    VMOV.I32 d4,#74             @ Constant used for multiplication
    VMULL.S16 q9,d22,d7[0]      @ q9 = 84*C1 + 84*C4
    VADD.S16 d16,d10,d11        @ d16 = C1 + C2
    VMUL.S16 d12,d12,d4[0]      @ d12 = 74*C3
    VMOV.I32 d5,#29             @ Constant used for multiplication
    VPADDL.U16 d0,d0            @ Accumulating SAD step 2
    VSUB.S16 d16,d16,d13        @ d16 = C1 + C2 - C4
    VMLAL.S16 q7,d22,d5[0]      @ q7 = 29*C1 + 55*C2 + 84*C4
    VMLSL.S16 q9,d23,d5[0]      @ q9 = 84*C1 - 29*C2 + 55*C4
    VMULL.S16 q8,d16,d4[0]      @ q8 = 74*C1 + 74*C2 - 74*C4
    VPADDL.U32 d0,d0            @ Accumulating SAD step 3, SAD in d0
    VSUB.S32 q10,q9,q7          @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
    VMOV.32 r0,d0[0]            @ Return SAD value
    VRSHR.S32 q8,q8,#1          @ Truncating the 1 bit in q8

    VADDW.S16 q7,q7,d12         @ q7 = 29*C1 + 55*C2 + 74*C3 + 84*C4
    VSUBW.S16 q9,q9,d12         @ q9 = 84*C1 - 29*C2 - 74*C3 + 55*C4
    VADDW.S16 q10,q10,d12       @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4

    VRSHR.S32 q7,q7,#1          @ Truncating the 1 bit in q7
    VRSHR.S32 q9,q9,#1          @ Truncating the 1 bit in q9
    VRSHR.S32 q10,q10,#1        @ Truncating the 1 bit in q10
                                @ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
    VTRN.32 q7,q8
    VTRN.32 q9,q10
    VSWP d15,d18
    VSWP d17,d20                @ Residue block transposed
                                @ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
    VADD.S32 q13,q7,q8          @ q13 = S1 + S2
    VADD.S32 q1,q7,q10          @ q1 = S1 + S4
    VADD.S32 q4,q8,q10          @ q4 = S2 + S4
    VSUB.S32 q13,q13,q10        @ q13 = S1 + S2 - S4
    VMUL.S32 q12,q1,d5[0]       @ q12 = 29*S1 + 29*S4
    VMUL.S32 q14,q1,d7[0]       @ q14 = 84*S1 + 84*S4
    VMUL.S32 q13,q13,d4[0]      @ q13 = 74*S1 + 74*S2 - 74*S4

    VMLA.S32 q12,q4,d6[0]       @ q12 = 29*S1 + 55*S2 + 84*S4
    VMLS.S32 q14,q4,d5[0]       @ q14 = 84*S1 - 29*S2 + 55*S4
    VMUL.S32 q9,q9,d4[0]        @ q9 = 74*S3

    LDR r4,[sp,#76]             @ r4 = dst_strd_chr_flag
    ASR r4,r4,#16               @ r4 = dst_strd
    LSL r4,r4,#1                @ r4 = 2*dst_strd

    VRSHRN.S32 d26,q13,#8
    VSUB.S32 q15,q14,q12        @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4

    VADD.S32 q12,q12,q9         @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
    VSUB.S32 q14,q14,q9         @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
    VADD.S32 q15,q15,q9         @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4

    VRSHRN.S32 d24,q12,#8
    VRSHRN.S32 d28,q14,#8
    VRSHRN.S32 d30,q15,#8       @ Truncating the last 8 bits
                                @ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
    VST1.64 d24,[r3],r4         @ Storing row 1 of transform stage 2
    VST1.64 d26,[r3],r4         @ Storing row 2 of transform stage 2
    VST1.64 d28,[r3],r4         @ Storing row 3 of transform stage 2
    VST1.64 d30,[r3]            @ Storing row 4 of transform stage 2

    vpop {d8 - d15}
    POP {r4}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and DCT integer forward transform
@*  on 8x8 block
@*
@* @description
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by DCT integer forward transform
@*
@* @param[in] pu1_src
@*  Input 8x8 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 8x8
@*
@* @param[out] pi2_dst
@*  Output 8x8 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd_chr_flag
@*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
@*
@* @returns void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@                              UWORD8 *pu1_pred,
@                              WORD32 *pi4_temp,
@                              WORD16 *pi2_dst,
@                              WORD32 src_strd,
@                              WORD32 pred_strd,
@                              WORD32 dst_strd_chr_flag);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp]   - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd_chr_flag
@
@*******************************************************************************
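
@ The 8x8 path below follows the same residue + SAD + two-pass transform pattern.
@ A compact scalar model is sketched here (illustrative only, not part of the
@ original sources; the interleaved chroma load path is omitted). The table holds
@ the usual HEVC 8-point coefficients, and the two per-stage shifts are folded
@ into a single rounding shift of 11, matching the assembly below:
@
@   #include <stdint.h>
@
@   static const int16_t kT8[8][8] = {
@       { 64,  64,  64,  64,  64,  64,  64,  64 },
@       { 89,  75,  50,  18, -18, -50, -75, -89 },
@       { 83,  36, -36, -83, -83, -36,  36,  83 },
@       { 75, -18, -89, -50,  50,  89,  18, -75 },
@       { 64, -64, -64,  64,  64, -64, -64,  64 },
@       { 50, -89,  18,  75, -75, -18,  89, -50 },
@       { 36, -83,  83, -36, -36,  83, -83,  36 },
@       { 18, -50,  75, -89,  89, -75,  50, -18 },
@   };
@
@   static uint32_t resi_trans_8x8_ref(const uint8_t *src, const uint8_t *pred,
@                                      int16_t *dst, int src_strd, int pred_strd,
@                                      int dst_strd)
@   {
@       int32_t r[8][8], t[8][8];
@       uint32_t sad = 0;
@       for (int i = 0; i < 8; i++)
@           for (int j = 0; j < 8; j++) {
@               r[i][j] = src[i * src_strd + j] - pred[i * pred_strd + j];
@               sad += (uint32_t)(r[i][j] < 0 ? -r[i][j] : r[i][j]);
@           }
@       for (int k = 0; k < 8; k++)             /* horizontal pass, no shift  */
@           for (int i = 0; i < 8; i++) {
@               int32_t acc = 0;
@               for (int j = 0; j < 8; j++)
@                   acc += kT8[k][j] * r[i][j];
@               t[k][i] = acc;
@           }
@       for (int m = 0; m < 8; m++)             /* vertical pass, round >> 11 */
@           for (int k = 0; k < 8; k++) {
@               int32_t acc = 0;
@               for (int j = 0; j < 8; j++)
@                   acc += kT8[m][j] * t[k][j];
@               dst[m * dst_strd + k] = (int16_t)((acc + 1024) >> 11);
@           }
@       return sad;
@   }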

.global ihevc_resi_trans_8x8_a9q

ihevc_resi_trans_8x8_a9q:

    PUSH {r4,r5}
    vpush {d8 - d15}

    @ Loading Prediction and Source blocks of size 8x8

    LDR r4,[sp,#80]             @ r4 = dst_strd_chr_flag
    AND r4,r4,#1                @ r4 = chr_flag
    CMP r4,#1
    BNE CHROMA_LOAD

LUMA_LOAD:

    LDR r5,[sp,#72]             @ r5 = src_strd
    LDR r4,[sp,#76]             @ r4 = pred_strd

    VLD2.8 {d0,d2},[r1],r4      @ Row 1 of prediction in d0
    VLD2.8 {d1,d3},[r0],r5      @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0          @ Row 1 of absolute difference in q15
    VLD2.8 {d2,d4},[r1],r4      @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0           @ Row 1 of residue in q0
    VLD2.8 {d3,d5},[r0],r5      @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2           @ Row 2 of absolute difference in q9
    VLD2.8 {d4,d6},[r1],r4      @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2           @ Row 2 of residue in q1
    VLD2.8 {d5,d7},[r0],r5      @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4          @ Row 3 of absolute difference accumulated in q15
    VLD2.8 {d6,d8},[r1],r4      @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4           @ Row 3 of residue in q2
    VLD2.8 {d7,d9},[r0],r5      @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6           @ Row 4 of absolute difference accumulated in q9
    VLD2.8 {d8,d10},[r1],r4     @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6           @ Row 4 of residue in q3
    VLD2.8 {d9,d11},[r0],r5     @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8          @ Row 5 of absolute difference in q10
    VLD2.8 {d10,d12},[r1],r4    @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8           @ Row 5 of residue in q4
    VLD2.8 {d11,d13},[r0],r5    @ Row 6 of source in d11

    VABAL.U8 q15,d11,d10        @ Row 6 of absolute difference accumulated in q15
    VLD2.8 {d12,d14},[r1],r4    @ Row 7 of prediction in d12
    VSUBL.U8 q5,d11,d10         @ Row 6 of residue in q5
    VLD2.8 {d13,d15},[r0],r5    @ Row 7 of source in d13

    VABAL.U8 q9,d13,d12         @ Row 7 of absolute difference accumulated in q9
    VLD2.8 {d14,d16},[r1]       @ Row 8 of prediction in d14
    VSUBL.U8 q6,d13,d12         @ Row 7 of residue in q6
    VLD2.8 {d15,d17},[r0]       @ Row 8 of source in d15

    B CHROMA_LOAD_END

CHROMA_LOAD:

    LDR r5,[sp,#72]             @ r5 = src_strd
    LDR r4,[sp,#76]             @ r4 = pred_strd

    VLD1.64 d0,[r1],r4          @ Row 1 of prediction in d0
    VLD1.64 d1,[r0],r5          @ Row 1 of source in d1

    VABDL.U8 q15,d1,d0          @ Row 1 of absolute difference in q15
    VLD1.64 d2,[r1],r4          @ Row 2 of prediction in d2
    VSUBL.U8 q0,d1,d0           @ Row 1 of residue in q0
    VLD1.64 d3,[r0],r5          @ Row 2 of source in d3

    VABDL.U8 q9,d3,d2           @ Row 2 of absolute difference in q9
    VLD1.64 d4,[r1],r4          @ Row 3 of prediction in d4
    VSUBL.U8 q1,d3,d2           @ Row 2 of residue in q1
    VLD1.64 d5,[r0],r5          @ Row 3 of source in d5

    VABAL.U8 q15,d5,d4          @ Row 3 of absolute difference accumulated in q15
    VLD1.64 d6,[r1],r4          @ Row 4 of prediction in d6
    VSUBL.U8 q2,d5,d4           @ Row 3 of residue in q2
    VLD1.64 d7,[r0],r5          @ Row 4 of source in d7

    VABAL.U8 q9,d7,d6           @ Row 4 of absolute difference accumulated in q9
    VLD1.64 d8,[r1],r4          @ Row 5 of prediction in d8
    VSUBL.U8 q3,d7,d6           @ Row 4 of residue in q3
    VLD1.64 d9,[r0],r5          @ Row 5 of source in d9

    VABDL.U8 q10,d9,d8          @ Row 5 of absolute difference in q10
    VLD1.64 d10,[r1],r4         @ Row 6 of prediction in d10
    VSUBL.U8 q4,d9,d8           @ Row 5 of residue in q4
    VLD1.64 d11,[r0],r5         @ Row
6 of source in d11 495 496 VABAL.U8 q15,d11,d10 @ Row 6 of absolute difference accumulated in q15 497 VLD1.64 d12,[r1],r4 @ Row 7 of prediction in d12 498 VSUBL.U8 q5,d11,d10 @ Row 6 of residue in q5 499 VLD1.64 d13,[r0],r5 @ Row 7 of source in d13 500 501 VABAL.U8 q9,d13,d12 @ Row 7 of absolute difference accumulated in q9 502 VLD1.64 d14,[r1] @ Row 8 of prediction in d14 503 VSUBL.U8 q6,d13,d12 @ Row 7 of residue in q6 504 VLD1.64 d15,[r0] @ Row 8 of source in d15 505 506 CHROMA_LOAD_END: 507 508 @ Transform stage 1 509 @ Transposing residue matrix 510 511 VABAL.U8 q10,d15,d14 @ Row 8 of absolute difference accumulated in q10 512 VTRN.16 q0,q1 @ Transpose residue matrix step (1a) 513 VSUBL.U8 q7,d15,d14 @ Row 8 of residue in q7 514 VTRN.16 q2,q3 @ Transpose residue matrix step (1b) 515 516 VTRN.16 q4,q5 @ Transpose residue matrix step (1c) 517 VTRN.16 q6,q7 @ Transpose residue matrix step (1d) 518 VTRN.32 q0,q2 @ Transpose residue matrix step (2a) 519 VTRN.32 q1,q3 @ Transpose residue matrix step (2b) 520 521 VADD.U16 q8,q15,q9 @ SAD calculation (1) 522 VTRN.32 q4,q6 @ Transpose residue matrix step (2c) 523 VTRN.32 q5,q7 @ Transpose residue matrix step (2d) 524 525 VADD.U16 q8,q8,q10 @ SAD calculation (2) 526 VSWP d1,d8 @ Transpose residue matrix step (3a) 527 VSWP d3,d10 @ Transpose residue matrix step (3b) 528 529 VADD.U16 d16,d16,d17 @ SAD calculation (3) 530 VSWP d7,d14 @ Transpose residue matrix step (3c) 531 VSWP d5,d12 @ Transpose residue matrix step (3d) 532 @ Columns of residue C0-C7 (8x8 matrix) in q0-q7 533 VPADDL.U16 d16,d16 @ SAD calculation (4) 534 535 @ Evaluating first step in Butterfly diagram 536 537 VADD.S16 q10,q0,q7 @ q10 = C0 + C7 538 VADD.S16 q11,q1,q6 @ q11 = C1 + C6 539 VPADDL.U32 d16,d16 @ SAD calculation (5) 540 VADD.S16 q12,q2,q5 @ q12 = C2 + C5 541 VADD.S16 q13,q3,q4 @ q13 = C3 + C4 542 543 VSUB.S16 q4,q3,q4 @ q4 = C3 - C4 544 VSUB.S16 q5,q2,q5 @ q5 = C2 - C5 545 VSUB.S16 q6,q1,q6 @ q6 = C1 - C6 546 VSUB.S16 q7,q0,q7 @ q7 = C0 - C7 547 548 @ Calculating F0, F2, F4 and F6 549 550 VADD.S16 q1,q11,q12 @ q1 = C1 + C2 + C5 + C6 551 VADD.S16 q2,q10,q13 @ q2 = C0 + C3 + C4 + C7 552 553 MOV r4,#50 554 LSL r4,r4,#16 555 ADD r4,r4,#18 556 MOV r5,#89 557 LSL r5,r5,#16 558 ADD r5,r5,#75 559 VMOV d0,r4,r5 @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18 560 561 MOV r4,#83 562 LSL r4,r4,#16 563 ADD r4,r4,#36 564 VMOV d1,r4,r4 @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36 565 566 VSUB.S16 q10,q10,q13 @ q10 = C0 - C3 - C4 + C7 567 VSUB.S16 q11,q11,q12 @ q11 = C1 - C2 - C5 + C6 568 VMOV.32 r0,d16[0] @ SAD calculation (6) : Return value = SAD 569 570 VSUB.S16 q3,q2,q1 @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7 571 VADD.S16 q2,q2,q1 @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7 572 573 VMULL.S16 q14,d20,d1[1] @ q14 = [0] of 83*(C0 - C3 - C4 + C7) 574 VMULL.S16 q15,d21,d1[1] @ q15 = [1] of 83*(C0 - C3 - C4 + C7) 575 VMULL.S16 q9,d20,d1[0] @ q9 = [0] of 36*(C0 - C3 - C4 + C7) 576 VMULL.S16 q10,d21,d1[0] @ q10 = [1] of 36*(C0 - C3 - C4 + C7) 577 578 VMLAL.S16 q14,d22,d1[0] @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6) 579 VSHLL.S16 q13,d6,#6 @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7) 580 VMLAL.S16 q15,d23,d1[0] @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6) 581 VSHLL.S16 q3,d7,#6 @ q3 = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7) 582 VMLSL.S16 q9,d22,d1[1] @ q9 = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6) 583 VSHLL.S16 q12,d4,#6 @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + 
C6 + C7) 584 VMLSL.S16 q10,d23,d1[1] @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6) 585 VSHLL.S16 q2,d5,#6 @ q2 = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7) 586 587 @ Calculating F1, F3, F5 and F7 588 589 MOV r4,#48 590 VST1.64 {d24,d25},[r2]! @ Row 1 of transform stage 1 F0[0] stored 591 VST1.64 {d4,d5},[r2],r4 @ Row 1 of transform stage 1 F0[1] stored 592 VST1.64 {d28,d29},[r2]! @ Row 3 of transform stage 1 F2[0] stored 593 VST1.64 {d30,d31},[r2],r4 @ Row 3 of transform stage 1 F2[1] stored 594 595 VST1.64 {d26,d27},[r2]! @ Row 5 of transform stage 1 F4[0] stored 596 VMULL.S16 q1,d14,d0[3] @ q1 = [0] of 89*(C0 - C7) 597 VMULL.S16 q8,d15,d0[3] @ q8 = [1] of 89*(C0 - C7) 598 VST1.64 {d6,d7},[r2],r4 @ Row 5 of transform stage 1 F4[1] stored 599 VMULL.S16 q11,d14,d0[2] @ q11 = [0] of 75*(C0 - C7) 600 VMULL.S16 q13,d15,d0[2] @ q13 = [1] of 75*(C0 - C7) 601 VST1.64 {d18,d19},[r2]! @ Row 7 of transform stage 1 F6[0] stored 602 VMULL.S16 q3,d14,d0[1] @ q3 = [0] of 50*(C0 - C7) 603 VMULL.S16 q9,d15,d0[1] @ q9 = [1] of 50*(C0 - C7) 604 VST1.64 {d20,d21},[r2] @ Row 7 of transform stage 1 F6[1] stored 605 VMULL.S16 q10,d14,d0[0] @ q10 = [0] of 18*(C0 - C7) 606 VMULL.S16 q7,d15,d0[0] @ q7 = [1] of 18*(C0 - C7) 607 608 VMLAL.S16 q1,d12,d0[2] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) 609 VMLAL.S16 q8,d13,d0[2] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) 610 VMLSL.S16 q11,d12,d0[0] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) 611 VMLSL.S16 q13,d13,d0[0] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) 612 VMLSL.S16 q3,d12,d0[3] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) 613 VMLSL.S16 q9,d13,d0[3] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) 614 VMLSL.S16 q10,d12,d0[1] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) 615 VMLSL.S16 q7,d13,d0[1] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) 616 617 VMLAL.S16 q1,d10,d0[1] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) 618 VMLAL.S16 q8,d11,d0[1] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) 619 VMLSL.S16 q11,d10,d0[3] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) 620 VMLSL.S16 q13,d11,d0[3] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) 621 VMLAL.S16 q3,d10,d0[0] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) 622 VMLAL.S16 q9,d11,d0[0] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) 623 VMLAL.S16 q10,d10,d0[2] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) 624 VMLAL.S16 q7,d11,d0[2] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) 625 626 VMLAL.S16 q1,d8,d0[0] @ q1 = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4) 627 VMLAL.S16 q8,d9,d0[0] @ q8 = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4) 628 VMLSL.S16 q11,d8,d0[1] @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4) 629 VMLSL.S16 q13,d9,d0[1] @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4) 630 SUB r2,r2,#176 @ r2 now points to the second row 631 VMLAL.S16 q3,d8,d0[2] @ q3 = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4) 632 VMLAL.S16 q9,d9,d0[2] @ q9 = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4) 633 VST1.64 {d2,d3},[r2]! @ Row 2 of transform stage 1 F1[0] stored 634 VMLSL.S16 q10,d8,d0[3] @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4) 635 VMLSL.S16 q7,d9,d0[3] @ q7 = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4) 636 637 VST1.64 {d16,d17},[r2],r4 @ Row 2 of transform stage 1 F1[1] stored 638 VST1.64 {d22,d23},[r2]! 
@ Row 4 of transform stage 1 F3[0] stored 639 VST1.64 {d26,d27},[r2],r4 @ Row 4 of transform stage 1 F3[1] stored 640 VST1.64 {d6,d7},[r2]! @ Row 6 of transform stage 1 F5[0] stored 641 VST1.64 {d18,d19},[r2],r4 @ Row 6 of transform stage 1 F5[1] stored 642 VST1.64 {d20,d21},[r2]! @ Row 8 of transform stage 1 F7[0] stored 643 VST1.64 {d14,d15},[r2] @ Row 8 of transform stage 1 F7[1] stored 644 645 @ Transform stage 2 (for rows 1-4 of transform stage 1) 646 @ Transposing the 4 rows (F0, F1, F2, F3) 647 @ F0 = {q2,q12}, F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11} 648 649 VTRN.32 q12,q1 @ Transposing first half of transform stage 1 (1a) 650 VTRN.32 q14,q11 @ Transposing first half of transform stage 1 (1b) 651 VSWP d25,d28 @ Transposing first half of transform stage 1 (2a) 652 VSWP d22,d3 @ Transposing first half of transform stage 1 (2b) 653 654 VTRN.32 q2,q8 @ Transposing first half of transform stage 1 (3a) 655 VTRN.32 q15,q13 @ Transposing first half of transform stage 1 (3b) 656 VSWP d5,d30 @ Transposing first half of transform stage 1 (4a) 657 VSWP d26,d17 @ Transposing first half of transform stage 1 (4b) 658 @ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13 659 660 @ Evaluating first step in Butterfly diagram 661 662 VADD.S32 q0,q12,q13 @ q0 = B0 + B7 663 VADD.S32 q5,q11,q2 @ q5 = B3 + B4 664 VADD.S32 q3,q1,q15 @ q3 = B1 + B6 665 VADD.S32 q4,q14,q8 @ q4 = B2 + B5 666 667 VSUB.S32 q7,q14,q8 @ q7 = B2 - B5 668 VSUB.S32 q8,q1,q15 @ q8 = B1 - B6 669 VSUB.S32 q6,q11,q2 @ q6 = B3 - B4 670 VSUB.S32 q9,q12,q13 @ q9 = B0 - B7 671 672 @ Calculating G0, G2, G4 and G6 673 674 MOV r4,#18 675 MOV r5,#50 676 VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18 677 VSUB.S32 q2,q0,q5 @ q2 = B0 - B3 - B4 + B7 678 679 MOV r4,#75 680 MOV r5,#89 681 VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75 682 VADD.S32 q10,q0,q5 @ q10 = B0 + B3 + B4 + B7 683 684 MOV r4,#36 685 MOV r5,#83 686 VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36 687 VSUB.S32 q11,q3,q4 @ q11 = B1 - B2 - B5 + B6 688 VADD.S32 q3,q3,q4 @ q3 = B1 + B2 + B5 + B6 689 690 VMUL.S32 q12,q2,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7) 691 VMUL.S32 q2,q2,d0[0] @ q2 = 36*(B0 - B3 - B4 + B7) 692 VMUL.S32 q5,q9,d3[1] @ q5 = 89*(B0 - B7) 693 VADD.S32 q14,q10,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 694 VMUL.S32 q4,q9,d3[0] @ q4 = 75*(B0 - B7) 695 VSUB.S32 q15,q10,q3 @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7 696 @ VSHL.S32 q14,q14,#6 ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7) 697 @ VSHL.S32 q15,q15,#6 ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7) 698 699 VMLA.S32 q12,q11,d0[0] @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6) 700 VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in G0 701 VMLS.S32 q2,q11,d0[1] @ q2 = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6) 702 VRSHRN.I32 d30,q15,#5 @ Truncating last 11 bits in G4 703 704 LDR r4,[sp,#80] @ r4 = dst_strd_chr_flag 705 ASR r4,r4,#16 @ r4 = dst_strd 706 LSL r4,r4,#2 @ r4 = 2*dst_strd*2 707 708 VMUL.S32 q3,q9,d2[1] @ q3 = 50*(B0 - B7) 709 VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in G2 710 VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7) 711 VRSHRN.I32 d4,q2,#11 @ Truncating last 11 bits in G6 712 713 VMLA.S32 q5,q8,d3[0] @ q5 = 89*(B0 - B7) + 75*(B1 - B6) 714 VST1.64 d28,[r3],r4 @ First half-row of row 1 of transform stage 2 (G0) stored 715 VMLS.S32 q4,q8,d2[0] @ q4 = 75*(B0 - B7) - 18*(B1 - B6) 716 717 VMLS.S32 q3,q8,d3[1] @ q3 = 50*(B0 - B7) - 89*(B1 - B6) 718 VST1.64 d24,[r3],r4 @ First half-row of row 3 of transform stage 2 
(G2) stored 719 VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) 720 721 VMLA.S32 q5,q7,d2[1] @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) 722 VST1.64 d30,[r3],r4 @ First half-row of row 5 of transform stage 2 (G4) stored 723 VMLS.S32 q4,q7,d3[1] @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) 724 725 VMLA.S32 q3,q7,d2[0] @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) 726 VST1.64 d4,[r3] @ First half-row of row 7 of transform stage 2 (G6) stored 727 VMLA.S32 q9,q7,d3[0] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) 728 729 VMLA.S32 q5,q6,d2[0] @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4) 730 VMLS.S32 q4,q6,d2[1] @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4) 731 VMLA.S32 q3,q6,d3[0] @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4) 732 VMLS.S32 q9,q6,d3[1] @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4) 733 734 SUB r3,r3,r4,LSL #1 735 SUB r3,r3,r4,ASR #1 @ r3 = r3 - 5*dst_strd*2 736 @ r3 is moved from row 7 to row 2 737 VRSHRN.I32 d10,q5,#11 @ Truncating last 11 bits in G1 738 VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in G3 739 VRSHRN.I32 d6,q3,#11 @ Truncating last 11 bits in G5 740 VST1.64 d10,[r3],r4 @ First half-row of row 2 of transform stage 2 (G1) stored 741 VRSHRN.I32 d18,q9,#11 @ Truncating last 11 bits in G7 742 743 VST1.64 d8,[r3],r4 @ First half-row of row 4 of transform stage 2 (G3) stored 744 VST1.64 d6,[r3],r4 @ First half-row of row 6 of transform stage 2 (G5) stored 745 VST1.64 d18,[r3]! @ First half-row of row 8 of transform stage 2 (G7) stored 746 747 @ Transform stage 2 (for rows 5-8 of transform stage 1) 748 @ Loading the 4 rows (F4, F5, F6, F7) 749 750 SUB r2,r2,#112 @ r2 jumps from row 8 to row 5 in temporary memory 751 VLD1.64 {d20,d21},[r2]! @ q10 = F4[0] 752 VLD1.64 {d22,d23},[r2]! @ q11 = F4[1] 753 VLD1.64 {d8,d9},[r2]! @ q4 = F5[0] 754 @ Transposing the 4 rows 755 @ F0 = {q11,q10}, F1 = {q5,q4}, F2 = {q3,q2} and F3 = {q13,q12} 756 757 VTRN.32 q10,q4 @ Transposing second half of transform stage 1 (1a) 758 VLD1.64 {d10,d11},[r2]! @ q5 = F5[1] 759 VLD1.64 {d4,d5},[r2]! @ q2 = F6[0] 760 VLD1.64 {d6,d7},[r2]! @ q3 = F6[1] 761 VLD1.64 {d24,d25},[r2]! 
@ q12 = F7[0] 762 VTRN.32 q2,q12 @ Transposing second half of transform stage 1 (1b) 763 VLD1.64 {d26,d27},[r2] @ q13 = F7[1] 764 765 VSWP d21,d4 @ Transposing second half of transform stage 1 (2a) 766 VSWP d24,d9 @ Transposing second half of transform stage 1 (2b) 767 768 VTRN.32 q11,q5 @ Transposing second half of transform stage 1 (3a) 769 VTRN.32 q3,q13 @ Transposing second half of transform stage 1 (3b) 770 VSWP d26,d11 @ Transposing second half of transform stage 1 (4b) 771 VSWP d23,d6 @ Transposing second half of transform stage 1 (4a) 772 @ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13 773 774 @ Evaluating first step in Butterfly diagram 775 776 VADD.S32 q0,q10,q13 @ q0 = B0 + B7 777 VADD.S32 q15,q12,q11 @ q15 = B3 + B4 778 VADD.S32 q1,q4,q3 @ q1 = B1 + B6 779 VADD.S32 q14,q2,q5 @ q14 = B2 + B5 780 781 VSUB.S32 q9,q10,q13 @ q9 = B0 - B7 782 VSUB.S32 q6,q12,q11 @ q6 = B3 - B4 783 VSUB.S32 q7,q2,q5 @ q7 = B2 - B5 784 VSUB.S32 q8,q4,q3 @ q8 = B1 - B6 785 786 @ Calculating H0, H2, H4 and H6 787 788 VADD.S32 q3,q1,q14 @ q3 = B1 + B2 + B5 + B6 789 VSUB.S32 q5,q1,q14 @ q5 = B1 - B2 - B5 + B6 790 791 MOV r4,#18 792 MOV r5,#50 793 VSUB.S32 q4,q0,q15 @ q4 = B0 - B3 - B4 + B7 794 VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18 795 796 MOV r4,#75 797 MOV r5,#89 798 VADD.S32 q2,q0,q15 @ q2 = B0 + B3 + B4 + B7 799 VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75 800 801 MOV r4,#36 802 MOV r5,#83 803 804 @ Calculating H1, H3, H5 and H7 805 806 VMUL.S32 q10,q9,d3[1] @ q10 = 89*(B0 - B7) 807 VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36 808 809 VMUL.S32 q13,q9,d3[0] @ q13 = 75*(B0 - B7) 810 811 VMUL.S32 q12,q4,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7) 812 VADD.S32 q14,q2,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 813 VMUL.S32 q4,q4,d0[0] @ q4 = 36*(B0 - B3 - B4 + B7) 814 VSUB.S32 q2,q2,q3 @ q2 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7 815 816 817 VMLA.S32 q12,q5,d0[0] @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6) 818 @ VSHL.S32 q14,q14,#6 ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7) 819 VMLS.S32 q4,q5,d0[1] @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6) 820 @ VSHL.S32 q2,q15,#6 ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7) 821 822 VMUL.S32 q11,q9,d2[1] @ q11 = 50*(B0 - B7) 823 VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in H0 824 VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7) 825 VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in H2 826 827 VMLA.S32 q10,q8,d3[0] @ q10 = 89*(B0 - B7) + 75*(B1 - B6) 828 VRSHRN.I32 d4,q2,#5 @ Truncating last 11 bits in H4 829 VMLS.S32 q13,q8,d2[0] @ q13 = 75*(B0 - B7) - 18*(B1 - B6) 830 VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in H6 831 832 LDR r4,[sp,#80] @ r4 = dst_strd_chr_flag 833 ASR r4,r4,#16 @ r4 = dst_strd 834 LSL r4,r4,#2 @ r4 = 2*dst_strd*2 835 836 SUB r3,r3,r4,LSL #2 837 ADD r3,r3,r4,ASR #1 @ r3 = r3 - 7*dst_strd*2 838 @ r3 is moved from row 8 to row 1 839 VMLS.S32 q11,q8,d3[1] @ q11 = 50*(B0 - B7) - 89*(B1 - B6) 840 VST1.64 d28,[r3],r4 @ Second half-row of row 1 of transform stage 2 (H0) stored 841 VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) 842 843 VMLA.S32 q10,q7,d2[1] @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) 844 VST1.64 d24,[r3],r4 @ Second half-row of row 3 of transform stage 2 (H2) stored 845 VMLS.S32 q13,q7,d3[1] @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) 846 847 VMLA.S32 q11,q7,d2[0] @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) 848 VST1.64 d4,[r3],r4 @ Second half-row of row 5 of transform stage 2 (H4) stored 849 VMLA.S32 
q9,q7,d3[0]                     @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)

    VMLA.S32 q10,q6,d2[0]       @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
    VST1.64 d8,[r3]             @ Second half-row of row 7 of transform stage 2 (H6) stored
    VMLS.S32 q13,q6,d2[1]       @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)

    VMLA.S32 q11,q6,d3[0]       @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
    VMLS.S32 q9,q6,d3[1]        @ q9 = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)

    SUB r3,r3,r4,LSL #1
    SUB r3,r3,r4,ASR #1         @ r3 = r3 - 5*dst_strd*2
                                @ r3 is moved from row 7 to row 2
    VRSHRN.I32 d20,q10,#11      @ Truncating last 11 bits in H1
    VRSHRN.I32 d26,q13,#11      @ Truncating last 11 bits in H3
    VRSHRN.I32 d22,q11,#11      @ Truncating last 11 bits in H5
    VST1.64 d20,[r3],r4         @ Second half-row of row 2 of transform stage 2 (H1) stored
    VRSHRN.I32 d18,q9,#11       @ Truncating last 11 bits in H7

    VST1.64 d26,[r3],r4         @ Second half-row of row 4 of transform stage 2 (H3) stored
    VST1.64 d22,[r3],r4         @ Second half-row of row 6 of transform stage 2 (H5) stored
    VST1.64 d18,[r3]            @ Second half-row of row 8 of transform stage 2 (H7) stored

    vpop {d8 - d15}
    POP {r4,r5}
    MOV pc,lr

@/**
@*******************************************************************************
@*
@* @brief
@*  This function performs residue calculation and forward transform on
@*  input pixels
@*
@* @par Description:
@*  Performs residue calculation by subtracting source and prediction and
@*  followed by forward transform
@*
@* @param[in] pu1_src
@*  Input 16x16 pixels
@*
@* @param[in] pu1_pred
@*  Prediction data
@*
@* @param[in] pi2_tmp
@*  Temporary buffer of size 16x16
@*
@* @param[out] pi2_dst
@*  Output 16x16 coefficients
@*
@* @param[in] src_strd
@*  Input stride
@*
@* @param[in] pred_strd
@*  Prediction Stride
@*
@* @param[in] dst_strd_chr_flag
@*  Output Stride and Chroma Flag packed in the MS and LS 16-bit
@*
@* @returns Void
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/

.extern g_ai2_ihevc_trans_16
.extern g_ai4_ihevc_trans_16

g_ai2_ihevc_trans_16_addr_1:
.long g_ai2_ihevc_trans_16 - ulbl1 - 8

g_ai2_ihevc_trans_16_addr_2:
.long g_ai2_ihevc_trans_16 - ulbl2 - 8

g_ai4_ihevc_trans_16_addr:
.long g_ai4_ihevc_trans_16 - ulbl3 - 8

.global ihevc_resi_trans_16x16_a9q

ihevc_resi_trans_16x16_a9q:

.equ TMP_STRIDE , 64            @ 16*4, stride of the tmp buffer
.equ SHIFT , 13                 @ shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
.equ RADD , 4096                @ 1 << (shift - 1);

.equ COFF_STD_2B , 32           @ Stride for g_ai2_ihevc_trans_16 in bytes
.equ COFF_STD_W , 32            @ Stride for g_ai4_ihevc_trans_16 in bytes

    @ Load the function arguments
    STMFD SP!,{r4-r12,LR}       @ save the registers on the stack
    vpush {d8 - d15}
    SUB SP,SP,#32

    LDR R4,[SP,#136]            @ get src_strd
    LDR R5,[SP,#140]            @ get pred_strd
    LDR R6,[SP,#144]            @ get dst_strd_chr_flag

    MOV R8,#0                   @ Set loop counter
    LDR R9,g_ai2_ihevc_trans_16_addr_1  @ get 16 bit transform matrix
ulbl1:
    ADD R9, R9, PC
    @ Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
    @ and write to stack
    MOV R12,#COFF_STD_2B
    LSL R12,#2

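@ For reference, each iteration of the horizontal loop that follows evaluates the
@ usual HEVC partial-butterfly decomposition of a 16-sample residue row, two rows
@ per iteration. A scalar sketch of one row is shown below (illustrative only, not
@ part of the original sources; rounding and shifting are applied later, in the
@ vertical pass, using RADD and SHIFT):
@
@   #include <stdint.h>
@
@   static void partial_butterfly_16_row(const int16_t *resi, int32_t *out,
@                                        const int16_t trans[16][16])
@   {
@       int32_t e[8], o[8], ee[4], eo[4], eee[2], eeo[2];
@       for (int k = 0; k < 8; k++) {
@           e[k] = resi[k] + resi[15 - k];
@           o[k] = resi[k] - resi[15 - k];
@       }
@       for (int k = 0; k < 4; k++) {
@           ee[k] = e[k] + e[7 - k];
@           eo[k] = e[k] - e[7 - k];
@       }
@       eee[0] = ee[0] + ee[3];  eeo[0] = ee[0] - ee[3];
@       eee[1] = ee[1] + ee[2];  eeo[1] = ee[1] - ee[2];
@       out[0]  = trans[0][0]  * eee[0] + trans[0][1]  * eee[1];
@       out[8]  = trans[8][0]  * eee[0] + trans[8][1]  * eee[1];
@       out[4]  = trans[4][0]  * eeo[0] + trans[4][1]  * eeo[1];
@       out[12] = trans[12][0] * eeo[0] + trans[12][1] * eeo[1];
@       for (int k = 2; k < 16; k += 4)       /* rows 2, 6, 10, 14 use eo[] */
@           out[k] = trans[k][0] * eo[0] + trans[k][1] * eo[1]
@                  + trans[k][2] * eo[2] + trans[k][3] * eo[3];
@       for (int k = 1; k < 16; k += 2) {     /* odd rows use o[]           */
@           int32_t acc = 0;
@           for (int j = 0; j < 8; j++)
@               acc += trans[k][j] * o[j];
@           out[k] = acc;
@       }
@   }
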
956 VLD1.S32 D30[0],[R9],R12 957 VLD1.S32 D30[1],[R9],R12 958 VLD1.S32 D31[0],[R9],R12 959 VLD1.S32 D31[1],[R9],R12 960 961 VTRN.S32 D30,D31 962 VTRN.S16 D30,D31 963 VST1.S16 {d30,d31},[SP] 964 965 LDR R9,g_ai2_ihevc_trans_16_addr_2 @get back 16 bit transform matrix 966 ulbl2: 967 ADD R9, R9, PC 968 969 MOV R7,#TMP_STRIDE 970 AND R14,R6,#0x1 971 972 VMOV.S32 Q14,#0 973 974 @R0 pu1_src 975 @R1 pu1_pred 976 @R2 pi4_tmp 977 @R3 pi2_dst 978 @R4 src_strd 979 @R5 pred_strd 980 @R6 dst_strd_chr_flag 981 @R7 tmp_dst Nx4 block stride 982 @R8 loop cntr 983 @R9 g_ai2_ihevc_trans_16 984 @R10 tmp_dst Nx4 block offset 985 @R11 tmp register 986 @R12 ------ 987 @R14 ------. 988 @q14 shift 32 bit 989 @q15 add 32 bit 990 991 CORE_LOOP_16X16_HORIZ: 992 993 CMP R14,#1 994 BEQ INTERLEAVED_LOAD_S1 995 996 VLD1.U8 {d0,d1},[R0],R4 @LOAD 1-16 src row 1 997 VLD1.U8 {d2,d3},[R1],R5 @LOAD 1-16 pred row 1 998 VLD1.U8 {d4,d5},[R0],R4 @LOAD 1-16 src row 2 999 VLD1.U8 {d6,d7},[R1],R5 @LOAD 1-16 pred row 2 1000 B LOAD_DONE 1001 1002 INTERLEAVED_LOAD_S1: 1003 1004 VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1 1005 VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1 1006 VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2 1007 VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2 1008 LOAD_DONE: 1009 1010 VSUBL.U8 Q4,D0,D2 @Get residue 1-8 row 1 1011 VSUBL.U8 Q5,D1,D3 @Get residue 9-16 row 1 1012 VSUBL.U8 Q6,D4,D6 @Get residue 1-8 row 2 1013 VSUBL.U8 Q7,D5,D7 @Get residue 9-16 row 2 1014 1015 @Get blk sads 1016 VABDL.U8 Q15,D0,D2 1017 VABAL.U8 Q15,D1,D3 1018 VABAL.U8 Q15,D4,D6 1019 VABAL.U8 Q15,D5,D7 1020 VADDW.S16 Q14,Q14,D30 1021 VADDW.S16 Q14,Q14,D31 1022 1023 VREV64.S16 Q5,Q5 @Rev row 1 1024 VREV64.S16 Q7,Q7 @Rev row 2 1025 VSWP D10,D11 1026 VSWP D14,D15 1027 1028 VADD.S16 Q8 ,Q4,Q5 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 1 1029 VSUB.S16 Q9 ,Q4,Q5 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 1 1030 VADD.S16 Q10,Q6,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 2 1031 VSUB.S16 Q11,Q6,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 2 1032 1033 VREV64.S16 D24,D17 @rev e[k] k-> 4-7 row 1 1034 VREV64.S16 D25,D21 @rev e[k] k-> 4-7 row 2 1035 VMOV.S16 D17,D20 1036 1037 @arrangement OF DATA 1038 @Q8 A1 A2 A3 A4 B1 B2 B3 B4 1039 @Q12 A8 A7 A6 A5 B8 B7 B6 B5 1040 1041 VADD.S16 Q13,Q8,Q12 @ee[k] = e[k] + e[7 - k] row 1 & 2 1042 VSUB.S16 Q0,Q8,Q12 @eo[k] = e[k] - e[7 - k] row 1 & 2 1043 1044 @D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3] 1045 @D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3] 1046 VTRN.S32 D26,D27 @1-cycle stall before it? 1047 @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1048 @D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3] 1049 VREV32.16 D2,D27 @1-cycle stall before it? 1050 @D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1051 @D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2] 1052 VMOV.S16 D27,D26 1053 VNEG.S16 D3,D2 1054 @Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1] R1ee[0] R1ee[1] R2ee[0] R2ee[1] 1055 @Q1 R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2] 1056 1057 @D8 : [0 0] [4 0] [8 0] [12 0] 1058 @D9 : [0 1] [4 1] [8 1] [12 1] 1059 VLD1.S16 {d8,d9},[SP] @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1] 1060 VADD.S16 Q1,Q13,Q1 @ 1-cycle stall before it? 1061 @Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1062 1063 @Q1 R1eee[0] R1eee[1] R2eee[0] R2eee[1] 1064 @ R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1] 1065 VTRN.S16 D2,D3 @2-cycle stall before it? 1066 @Q1 R1eee[0] R1eeo[0] R2eee[0] R2eeo[0] 1067 @ R1eee[1] R1eeo[1] R2eee[1] R2eeo[1] 1068 1069 VDUP.S32 D4,D2[0] @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] ;1-cycle stall? 
1070 VDUP.S32 D5,D2[1] @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1071 VDUP.S32 D6,D3[0] @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1072 VDUP.S32 D7,D3[1] @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1073 1074 @---------------Process EO-------------------- 1075 @ Early start to avoid stalls 1076 MOV R12,#COFF_STD_2B @Get stride of coeffs 1077 1078 VMULL.S16 Q5,D4,D8 @ g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] 1079 VMLAL.S16 Q5,D6,D9 @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1] 1080 VMULL.S16 Q6,D5,D8 @ g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0] 1081 VMLAL.S16 Q6,D7,D9 @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1] 1082 1083 ADD R11,R9,R12,LSL #1 @Load address of g_ai2_ihevc_trans_16[2] 1084 LSL R12,R12,#2 1085 1086 VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4]] 1087 1088 VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1089 VMULL.S16 Q1,D26,D0 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R1 1090 1091 VMULL.S16 Q2,D26,D1 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R2 1092 1093 VZIP.S32 Q5,Q6 @3-cycle instruction 1094 VMULL.S16 Q3,D27,D0 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R1 1095 1096 1097 VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1098 VMULL.S16 Q4,D27,D1 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R2 1099 1100 @These values must go to 0 4 8 12 colums hence we need stride *4 1101 LSL R10,R7,#2 1102 1103 VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1104 1105 VST1.32 D10,[R2],R10 1106 VMULL.S16 Q8,D27,D1 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2 1107 1108 VST1.32 D11,[R2],R10 1109 VMULL.S16 Q7,D27,D0 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1 1110 1111 VST1.32 D12,[R2],R10 1112 VMULL.S16 Q5,D26,D0 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1 1113 1114 VST1.32 D13,[R2],R10 1115 VMULL.S16 Q6,D26,D1 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2 1116 1117 SUB R2,R2,R10,LSL #2 1118 1119 @transpose the 4x4 matrix row1 1120 VTRN.32 Q1, Q3 @R1 transpose1 -- 2 cycles 1121 1122 @transpose the 4x4 matrix row2 1123 VTRN.32 Q2,Q4 @R2 transpose1 -- 2 cycles 1124 1125 VTRN.32 Q5, Q7 @R1 transpose1 -- 2 cycles 1126 1127 VTRN.32 Q6,Q8 @R2 transpose1 -- 2 cycles 1128 1129 VSWP D10,D3 @R1 transpose2 1130 VSWP D14,D7 @R1 transpose2 1131 1132 VSWP D12,D5 @R2 transpose2 1133 VSWP D16,D9 @R2 transpose2 1134 1135 VADD.S32 Q5,Q5,Q1 @R1 add 1136 VADD.S32 Q3,Q3,Q7 @R1 add 1137 1138 VADD.S32 Q2,Q2,Q4 @R2 add 1139 VADD.S32 Q6,Q6,Q8 @R2 add 1140 1141 VADD.S32 Q5,Q5,Q3 @R1 add 1142 1143 VADD.S32 Q4,Q6,Q2 @R2 add 1144 1145 @-----------------------Processing O ---------------------------- 1146 @ Early start to avoid stalls 1147 MOV R12,#COFF_STD_2B @Get coeffs stride 1148 LSL R12,R12,#1 1149 ADD R11,R9,#COFF_STD_2B @Get address of g_ai2_ihevc_trans_16[1] 1150 1151 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles 1152 1153 VZIP.S32 Q5,Q4 @ 3 cycle instruction 1154 VMULL.S16 Q6,D18,D4 @o[0][0-3]* R1 1155 1156 1157 VMLAL.S16 Q6,D19,D5 @o[0][4-7]* R1 ; follows MULL instruction: Multiplier accumulator forwarding 1158 @write to memory 1159 @this should go to 2 6 10 14 1160 LSL R10,R7,#2 1161 ADD R2,R2,R7,LSL #1 @move to third row 1162 VST1.32 D10,[R2],R10 1163 VMULL.S16 Q7,D22,D4 @o[0][0-3]* R2 1164 1165 VST1.32 D11,[R2],R10 1166 VMLAL.S16 Q7,D23,D5 @o[0][4-7]* R2 1167 1168 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1169 1170 VST1.32 D8,[R2],R10 1171 VMULL.S16 Q8,D18,D4 @o[1][0-3]* R1 1172 1173 VST1.32 D9,[R2],R10 1174 VMLAL.S16 Q8,D19,D5 @o[1][4-7]* R1 1175 SUB R2,R2,R10,LSL #2 1176 SUB R2,R2,R7,LSL #1 1177 
1178 @--------------------Done procrssing EO ------------------------- 1179 1180 @ -----------------Processing O continues------------------------ 1181 1182 VMULL.S16 Q10,D22,D4 @o[1][0-3]* R2 1183 VMLAL.S16 Q10,D23,D5 @o[1][4-7]* R2 1184 1185 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1186 1187 VLD1.S16 {d6,d7},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1188 VMULL.S16 Q12,D18,D4 @o[2][0-3]* R1 1189 1190 VMLAL.S16 Q12,D19,D5 @o[2][4-7]* R1 1191 VMULL.S16 Q0,D18,D6 @o[3][0-3]* R1 1192 VMLAL.S16 Q0,D19,D7 @o[3][4-7]* R1 1193 1194 VMULL.S16 Q13,D22,D4 @o[2][0-3]* R2 1195 VMLAL.S16 Q13,D23,D5 @o[2][4-7]* R2 1196 VMULL.S16 Q1,D22,D6 @o[3][0-3]* R2 1197 VMLAL.S16 Q1,D23,D7 @o[3][4-7]* R2 1198 1199 @transpose the 4x4 matrix R1 1200 VTRN.32 Q6, Q8 @ 2-cycle instruction 1201 1202 VTRN.32 Q12,Q0 @ 2-cycle instruction 1203 1204 @transpose the 4x4 matrix R2 1205 VTRN.32 Q7,Q10 @ 2-cycle instruction 1206 1207 VTRN.32 Q13,Q1 @ 2-cycle instruction 1208 1209 VSWP D24,D13 1210 VSWP D0, D17 1211 1212 VSWP D26,D15 1213 VSWP D2,D21 1214 1215 VADD.S32 Q8 ,Q8 ,Q6 1216 VADD.S32 Q12,Q12,Q0 1217 1218 VADD.S32 Q10,Q10,Q7 1219 VADD.S32 Q13,Q13,Q1 1220 1221 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[9][0-7] 1222 VADD.S32 Q12 ,Q12 ,Q8 1223 1224 VADD.S32 Q13,Q13,Q10 1225 VMULL.S16 Q3,D18,D4 @o[4][0-3]* R1 1226 VMLAL.S16 Q3,D19,D5 @o[4][4-7]* R1 1227 1228 VZIP.S32 Q12,Q13 1229 VMULL.S16 Q4,D22,D4 @o[0][0-3]* R2 1230 1231 1232 VMLAL.S16 Q4,D23,D5 @o[0][4-7]* R2 1233 @write to memory 1234 @this should go to 1 3 5 7 1235 ADD R2,R2,R7 1236 LSL R7,R7,#1 1237 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[11][0-7] 1238 1239 VST1.32 D24,[R2],R7 1240 VMULL.S16 Q5,D18,D4 @o[5][0-3]* R1 1241 1242 VST1.32 D25,[R2],R7 1243 VMLAL.S16 Q5,D19,D5 @o[5][4-7]* R1 1244 1245 VST1.32 D26,[R2],R7 1246 VMULL.S16 Q6,D22,D4 @o[0][0-3]* R2 1247 1248 VST1.32 D27,[R2],R7 1249 VMLAL.S16 Q6,D23,D5 @o[0][4-7]* R2 1250 1251 VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[13][0-7] 1252 1253 VLD1.S16 {d2,d3},[R11],R12 @g_ai2_ihevc_trans_16[15][0-7] 1254 VMULL.S16 Q7,D18,D4 @o[6][0-3]* R1 1255 1256 VMLAL.S16 Q7,D19,D5 @o[6][4-7]* R1 1257 VMULL.S16 Q10,D18,D2 @o[7][0-3]* R1 1258 VMLAL.S16 Q10,D19,D3 @o[7][4-7]* R1 1259 1260 VMULL.S16 Q8,D22,D4 @o[0][0-3]* R2 1261 VMLAL.S16 Q8,D23,D5 @o[0][4-7]* R2 1262 VMULL.S16 Q12,D22,D2 @o[0][0-3]* R2 1263 VMLAL.S16 Q12,D23,D3 @o[0][4-7]* R2 1264 1265 1266 @transpose the 4x4 matrix R1 1267 VTRN.32 Q3 ,Q5 @ 2-cycle instruction 1268 1269 VTRN.32 Q7 ,Q10 @ transpose step 2 R1 , 2-cycle instruction 1270 1271 @transpose the 4x4 matrix R2 1272 VTRN.32 Q4 ,Q6 @ 2-cycle instruction 1273 1274 VTRN.32 Q8 ,Q12 @ transpose step 2 R2 , 2-cycle instruction 1275 1276 VSWP D14,D7 @ transpose step 3, R1 1277 VSWP D20,D11 @ transpose step 4, R1 1278 VSWP D16,D9 @ transpose step 3, R2 1279 VSWP D24,D13 @ transpose step 4, R2 1280 1281 VADD.S32 Q5 ,Q5 ,Q3 1282 VADD.S32 Q10,Q10,Q7 1283 VADD.S32 Q6 ,Q6 ,Q4 1284 VADD.S32 Q12,Q12,Q8 1285 VADD.S32 Q10,Q10,Q5 1286 VADD.S32 Q12,Q12,Q6 1287 1288 @ 2-cycle stall 1289 VZIP.S32 Q10,Q12 @ 3-cycle instruction 1290 1291 @ 2-cycle stall 1292 @this should go to 9 11 13 15 1293 VST1.32 D20,[R2],R7 1294 1295 VST1.32 D21,[R2],R7 1296 1297 VST1.32 D24,[R2],R7 1298 1299 VST1.32 D25,[R2],R7 1300 1301 SUB R2,R2,R7,LSL #3 1302 LSR R7,R7,#1 1303 SUB R2,R2,R7 1304 1305 ADD R2,R2,#8 @MOVE TO NEXT to next COLUMN - pi4_tmp 1306 1307 ADD R8,R8,#2 @increment loop cntr 1308 CMP R8,#16 @check lllop cntr 1309 BNE CORE_LOOP_16X16_HORIZ @jump acc 1310 1311 1312 @*****************Vertical 
transform************************************ 1313 1314 @Initialization for vert transform 1315 @pi4_tmp will be the new src 1316 @tmp stride will be new src stride 1317 @dst will be new pi4_tmp 1318 @dst stride will be new tmp stride 1319 @trans table will be of 32 bit 1320 1321 LDR R9,g_ai4_ihevc_trans_16_addr @get 32 bit transform matrix 1322 ulbl3: 1323 ADD R9, R9, PC 1324 1325 SUB R0,R2,#64 @set tmp as src [-32 to move back to orgin] 1326 MOV R2,R3 @set dst as tmp 1327 MOV R4,#TMP_STRIDE @set tmp stride as src stride 1328 LSR R7,R6,#15 @Set dst stride as tmp stride 1329 SUB R4,#48 @Adjust stride 3 previous loads 1330 1331 @Block SAD 1332 VADD.S32 D28,D28,D29 1333 VPADD.S32 D28,D28,D29 1334 VMOV.S32 R3,D28[0] 1335 @ SAD calculation ends -- final value in R3. 1336 1337 @Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] 1338 @values of g_ai4_ihevc_trans_16 and write to stack 1339 MOV R12,#COFF_STD_W 1340 LSL R12,R12,#2 1341 VLD1.S32 D28,[R9],R12 1342 VLD1.S32 D29,[R9],R12 1343 VLD1.S32 D30,[R9],R12 1344 VLD1.S32 D31,[R9],R12 1345 SUB R9,R9,R12,LSL #2 1346 1347 VREV64.32 Q15,Q15 1348 VTRN.S32 Q14,Q15 1349 VST1.S32 {Q14-Q15},[SP] 1350 1351 VMOV.U32 Q14,#RADD @get the round factor to q14 1352 VMOV.U32 Q15,#SHIFT @Get the shift to neon 1353 1354 MOV R8,#0 @INIT LOOP 1355 1356 CORE_LOOP_16X16_VERT: 1357 1358 VLD1.S32 {D0,D1},[R0]! @LOAD 1-4 src R1 1359 VLD1.S32 {D2,D3},[R0]! @LOAD 5-8 pred R1 1360 VLD1.S32 {D4,D5},[R0]! @LOAD 9-12 src R1 1361 VLD1.S32 {D6,D7},[R0],R4 @LOAD 12-16 pred R1 1362 1363 VLD1.S32 {D8,D9},[R0]! @LOAD 1-4 src R2 1364 VLD1.S32 {D10,D11},[R0]! @LOAD 5-8 pred R2 1365 VLD1.S32 {D12,D13},[R0]! @LOAD 9-12 src R2 1366 VLD1.S32 {D14,D15},[R0],R4 @LOAD 12-16 pred R2 1367 1368 VREV64.S32 Q2,Q2 @Rev 9-12 R1 1369 VREV64.S32 Q3,Q3 @Rev 12-16 R1 1370 VREV64.S32 Q6,Q6 @Rev 9-12 R2 1371 VREV64.S32 Q7,Q7 @Rev 12-16 R2 1372 1373 VSWP D6,D7 1374 VSWP D4,D5 1375 VADD.S32 Q8 ,Q0,Q3 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R1 1376 VSWP D12,D13 @ dual issued with prev. instruction 1377 VADD.S32 Q9 ,Q1,Q2 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R1 1378 VSWP D14,D15 @ dual issued with prev. instruction 1379 VSUB.S32 Q10,Q0,Q3 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R1 1380 VSUB.S32 Q11,Q1,Q2 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R1 1381 1382 VADD.S32 Q12,Q4,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R2 1383 VREV64.S32 Q9 ,Q9 @rev e[k] k-> 4-7 R1, dual issued with prev. instruction 1384 VADD.S32 Q13,Q5,Q6 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R2 1385 VSUB.S32 Q0 ,Q4,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R2 1386 VSWP D18,D19 @ dual issued with prev. instruction 1387 VSUB.S32 Q1 ,Q5,Q6 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R2 1388 VREV64.S32 Q13,Q13 @rev e[k] k-> 4-7 R2, dual issued with prev. instruction 1389 1390 VADD.S32 Q2,Q8,Q9 @ee[k] = e[k] + e[7 - k] row R1 1391 VSUB.S32 Q3,Q8,Q9 @eo[k] = e[k] - e[7 - k] row R1 1392 VSWP D26,D27 1393 1394 1395 VADD.S32 Q4,Q12,Q13 @ee[k] = e[k] + e[7 - k] row R2 1396 VSUB.S32 Q5,Q12,Q13 @eo[k] = e[k] - e[7 - k] row R2 1397 VREV64.S32 D5,D5 @rev ee[k] 4-7 R1, dual issued with prev. instruction 1398 1399 VADD.S32 D12,D4,D5 @eee[0] eee[1] R1 1400 VSUB.S32 D13,D4,D5 @eeo[0] eeo[1] R1 1401 VREV64.S32 D9,D9 @rev ee[k] 4-7 R2, dual issued with prev. 
instruction 1402 1403 1404 VADD.S32 D14,D8,D9 @eee[0] eee[1] R2 1405 VSUB.S32 D15,D8,D9 @eeo[0] eeo[1] R2 1406 1407 VLD1.S32 {Q12,Q13},[SP] @Load g_ai2_ihevc_trans_16[xx]-> Q12 : [0 0] [8 0] [4 0] [12 0] Q13 : [0 1] [8 1] [4 1] [12 1] 1408 VREV64.S32 Q8,Q6 @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1 -> ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1 1409 1410 VREV64.S32 Q9,Q7 @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2 -> ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2 1411 1412 1413 VMUL.S32 Q4,Q6,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R1 1414 VMLA.S32 Q4,Q8,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R1 1415 1416 VMUL.S32 Q6,Q7,Q12 @g_ai2_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R2 1417 VMLA.S32 Q6,Q9,Q13 @g_ai2_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2 1418 1419 @Q3 :R1E00 R1E01 R1E02 R1E03 1420 @Q5 :R2E00 R2E01 R2E02 R2E03 1421 VSWP D7,D10 @ dual issued with prev. instruction 1422 @Q3 :R1E00 R1E01 R2E00 R2E01 1423 @Q5 :R1E02 R1E03 R2E02 R2E03 1424 VSWP D7,D11 1425 @Q3 :R1E00 R1E01 R2E02 R2E03 1426 @Q5 :R1E02 R1E03 R2E00 R2E01 1427 1428 MOV R12,#COFF_STD_W 1429 ADD R11,R9,R12,LSL #1 @Get to the 2nd row of src 1430 LSL R12,R12,#2 1431 1432 VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr. 1433 1434 VADD.S32 Q4,Q4,Q14 @ROUND R1 1435 VMUL.S32 Q12,Q3,Q7 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction 1436 VSWP D14,D15 @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction 1437 1438 VADD.S32 Q6,Q6,Q14 @ROUND R2 1439 1440 VSHRN.S32 D8,Q4,#SHIFT @NARROW R1 1441 1442 VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4] 1443 VSHRN.S32 D9,Q6,#SHIFT @NARROW R2, dual issued in 2nd cycle 1444 1445 VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction 1446 VSWP D16,D17 @dual issued with prev. instr. 1447 1448 VZIP.S16 D8,D9 @INTERLEAVE R1 R2 R1 R2 R1 R2 to write 1449 VMLA.S32 Q12,Q5,Q7 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction 1450 1451 1452 @WRITE INTO MEM the values or wait to be shuffled 1453 @These values must go to 0 4 8 12 colums 1454 LSL R10,R7,#2 1455 VST1.S32 D8[0],[R2],R10 1456 1457 VST1.S32 D9[0],[R2],R10 1458 1459 VST1.S32 D8[1],[R2],R10 1460 VPADD.S32 D18,D24,D25 @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03 1461 @D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01 1462 1463 VST1.S32 D9[1],[R2],R10 1464 VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1465 LSL R10,R10,#2 1466 SUB R2,R2,R10 1467 1468 VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4] 1469 1470 VMUL.S32 Q6,Q3,Q7 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] 1471 VSWP D14,D15 @ dual issued with prev. instruction 1472 VPADD.S32 D19,D4,D5 1473 1474 VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4] 1475 VMUL.S32 Q2,Q3,Q8 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] 1476 VSWP D16,D17 1477 1478 VMLA.S32 Q6,Q5,Q7 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1479 VADD.S32 Q9,Q9,Q14 @Round by RADD R1 1480 VMLA.S32 Q2,Q5,Q8 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] 1481 VSHRN.S32 D8,Q9,#SHIFT @Shift by SHIFT 1482 VPADD.S32 D24,D12,D13 1483 @---------------Processing O, Row 1 and Row 2-------------------------------------- 1484 @ Early start to avoid stalls 1485 MOV R12,#COFF_STD_W 1486 ADD R11,R9,R12 @Get 1ST row 1487 LSL R12,R12,#1 1488 1489 LSL R10,R7,#2 1490 ADD R2,R2,R7,LSL #1 @move to third row 1491 @this should go to 2 6 10 14 1492 VST1.S32 D8[0],[R2],R10 1493 1494 VST1.S32 D8[1],[R2],R10 1495 VPADD.S32 D25,D4,D5 @ dual issued with prev. 
instruction in 2nd cycle 1496 1497 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1498 VADD.S32 Q12,Q12,Q14 @Round by RADD R2, dual issued with prev. instruction in 2nd cycle 1499 VMUL.S32 Q6,Q2,Q0 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R2 1500 VMLA.S32 Q6,Q3,Q1 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R2 1501 VSHRN.S32 D9,Q12,#SHIFT @Shift by SHIFT 1502 1503 VMUL.S32 Q2,Q2,Q10 @g_ai2_ihevc_trans_16[1][0-3]*o[0][0-3] R1 1504 VMLA.S32 Q2,Q3,Q11 @g_ai2_ihevc_trans_16[1][4-7]*o[0][4-7] R1 1505 VADD.S32 D11,D12,D13 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr. 1506 VST1.S32 D9[0],[R2],R10 1507 1508 VST1.S32 D9[1],[R2],R10 1509 VADD.S32 D10,D4,D5 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr. 1510 LSL R10,R10,#2 @go back to orgin 1511 SUB R2,R2,R10 1512 SUB R2,R2,R7,LSL #1 1513 1514 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1515 1516 VMUL.S32 Q7,Q2,Q10 @o[0][0-3] 1517 VMLA.S32 Q7,Q3,Q11 @o[0][4-7] 1518 VMUL.S32 Q8,Q2,Q0 @o[0][0-3] 1519 VMLA.S32 Q8,Q3,Q1 @o[0][4-7] 1520 1521 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7] 1522 VADD.S32 D18,D14,D15 1523 VMUL.S32 Q12,Q2,Q10 @o[0][0-3] 1524 VMLA.S32 Q12,Q3,Q11 @o[0][4-7] 1525 VADD.S32 D19,D16,D17 1526 VMUL.S32 Q4,Q2,Q0 1527 VMLA.S32 Q4,Q3,Q1 1528 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7] 1529 VADD.S32 D26,D24,D25 @ dual issued with prev. instr. 1530 VMUL.S32 Q6,Q2,Q10 @o[0][0-3] 1531 VMLA.S32 Q6,Q3,Q11 @o[0][4-7] 1532 VADD.S32 D27,D8,D9 1533 VMUL.S32 Q4,Q2,Q0 1534 VMLA.S32 Q4,Q3,Q1 1535 VADD.S32 D12,D12,D13 1536 @Q5 Q9 Q13 Q6 1537 VPADD.S32 D14,D10,D11 1538 VPADD.S32 D15,D18,D19 1539 VPADD.S32 D16,D26,D27 1540 VADD.S32 D13,D8,D9 1541 VADD.S32 Q9,Q7,Q14 1542 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[0][0-7] 1543 VPADD.S32 D17,D12,D13 @ dual issued with prev. instr. in 2nd cycle 1544 1545 VMUL.S32 Q4,Q2,Q10 @o[0][0-3] 1546 VMLA.S32 Q4,Q3,Q11 @o[0][4-7] 1547 1548 VADD.S32 Q12,Q8,Q14 1549 1550 VMUL.S32 Q6,Q2,Q0 @o[0][0-3] 1551 VMLA.S32 Q6,Q3,Q1 @o[0][4-7] 1552 1553 VSHRN.S32 D26,Q9,#SHIFT 1554 VSHRN.S32 D27,Q12,#SHIFT 1555 VADD.S32 D10,D8,D9 1556 @write to memory this should go to 1 3 5 7 1557 ADD R2,R2,R7 1558 LSL R7,R7,#1 1559 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] 1560 VADD.S32 D11,D12,D13 @ dual issued with prev. instr. 1561 1562 VST1.S32 D26[0],[R2],R7 1563 VMUL.S32 Q7,Q2,Q10 @o[0][0-3] 1564 VMLA.S32 Q7,Q3,Q11 @o[0][4-7] 1565 VST1.S32 D26[1],[R2],R7 1566 VMUL.S32 Q8,Q2,Q0 @o[0][0-3] 1567 VMLA.S32 Q8,Q3,Q1 @o[0][4-7] 1568 VST1.S32 D27[0],[R2],R7 1569 VADD.S32 D18,D14,D15 1570 VST1.S32 D27[1],[R2],R7 1571 1572 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[2][0-7] 1573 VADD.S32 D19,D16,D17 @ dual issued with prev. instr. 1574 1575 VMUL.S32 Q12,Q2,Q10 @o[0][0-3] 1576 VMLA.S32 Q12,Q3,Q11 @o[0][4-7] 1577 VMUL.S32 Q4,Q2,Q0 1578 VMLA.S32 Q4,Q3,Q1 1579 1580 VLD1.S32 {Q2,Q3},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7] 1581 VADD.S32 D26,D24,D25 1582 1583 VMUL.S32 Q6,Q2,Q10 @o[0][0-3] 1584 VMLA.S32 Q6,Q3,Q11 @o[0][4-7] 1585 VADD.S32 D27,D8,D9 1586 1587 VMUL.S32 Q4,Q2,Q0 1588 VMLA.S32 Q4,Q3,Q1 1589 VADD.S32 D12,D12,D13 1590 @Q5 Q9 Q13 Q6 1591 VPADD.S32 D14,D10,D11 1592 VPADD.S32 D15,D18,D19 1593 VPADD.S32 D16,D26,D27 1594 VADD.S32 D13,D8,D9 1595 VADD.S32 Q9,Q7,Q14 1596 @ 1- cycle stall? 1597 VPADD.S32 D17,D12,D13 1598 VSHRN.S32 D22,Q9,#SHIFT 1599 VADD.S32 Q10,Q8,Q14 1600 @ 2-cycle stall? 
    VSHRN.S32 D23,Q10,#SHIFT

    @ this should go to 9 11 13 15
    @ LSL R11,R7,#1
    VST1.S32 D22[0],[R2],R7
    VST1.S32 D22[1],[R2],R7
    VST1.S32 D23[0],[R2],R7
    VST1.S32 D23[1],[R2],R7

    SUB R2,R2,R7,LSL #3
    LSR R7,R7,#1
    SUB R2,R2,R7

    ADD R2,R2,#4                @ move to the next column

    ADD R8,R8,#2                @ increment loop counter by 2 since two columns are processed per iteration
    CMP R8,#16                  @ check loop counter
    BNE CORE_LOOP_16X16_VERT    @ loop back if not done

    MOV R0,R3

    ADD SP,SP,#32
    vpop {d8 - d15}
    LDMFD sp!,{r4-r12,PC}       @ restore the registers and return