1 //****************************************************************************** 2 //* 3 //* Copyright (C) 2015 The Android Open Source Project 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //***************************************************************************** 18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 //*/ 20 ///** 21 // ******************************************************************************* 22 // * @file 23 // * impeg2_idct.s 24 // * 25 // * @brief 26 // * contains function definitions for single stage inverse transform 27 // * 28 // * @author 29 // * anand s 30 // * 31 // * @par list of functions: 32 // * - impeg2_idct_recon_dc_av8() 33 // * 34 // * @remarks 35 // * none 36 // * 37 // ******************************************************************************* 38 //*/ 39 40 ///** 41 // ******************************************************************************* 42 // * 43 // * @brief 44 // * this function performs inverse transform and reconstruction for 8x8 45 // * input block 46 // * 47 // * @par description: 48 // * performs inverse transform and adds the prediction data and clips output 49 // * to 8 bit 50 // * 51 // * @param[in] pi2_src 52 // * input 8x8 coefficients 53 // * 54 // * @param[in] pi2_tmp 55 // * temporary 8x8 buffer for storing inverse 56 // * 57 // * transform 58 // * 1st stage output 59 // * 60 // * @param[in] pu1_pred 61 // 
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *  NOTE(review): no 'shift' argument appears in the prototype below - confirm
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @param[in] zero_rows
// *  zero rows in pi2_src (read from the stack by impeg2_idct_recon_av8)
// *
// * @returns  void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void impeg2_itrans_recon_8x8(word16 *pi2_src,
//                             word16 *pi2_tmp,
//                             uword8 *pu1_pred,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 pred_strd,
//                             word32 dst_strd,
//                             word32 zero_cols,
//                             word32 zero_rows)

//**************variables vs registers*************************
//    x0   => *pi2_src
//    x1   => *pi2_tmp
//    x2   => *pu1_pred
//    x3   => *pu1_dst
//    w4   =>  src_strd
//    w5   =>  pred_strd
//    w6   =>  dst_strd
//    w7   =>  zero_cols
//    [sp] =>  zero_rows



.text
.align 4
.include "impeg2_neon_macros.s"

// Fixed-point scaling of the two IDCT stages: shift amounts and the matching
// round-to-nearest constants (half of the divisor).
.set idct_stg1_shift       ,            12
.set idct_stg2_shift       ,            16
.set idct_stg1_round     ,            (1 << (idct_stg1_shift - 1))
.set idct_stg2_round     ,            (1 << (idct_stg2_shift - 1))

.extern gai2_impeg2_idct_q15
.extern gai2_impeg2_idct_q11
.extern gai2_impeg2_idct_first_col_q15
.extern gai2_impeg2_idct_first_col_q11
.extern gai2_impeg2_mismatch_stg2_additive

///**
// *******************************************************************************
// *
// * @brief
// *  DC-only inverse transform and reconstruction of an 8x8 block
// *
// * @par description:
// *  Reads only the first coefficient pi2_src[0], scales it by the first entry
// *  of gai2_impeg2_idct_q15 (round, >> idct_stg1_shift) and then by the first
// *  entry of gai2_impeg2_idct_q11 (round, >> idct_stg2_shift). The low 16 bits
// *  of the result are broadcast, added to 8 rows of 8 prediction bytes, and
// *  each sum is saturated to unsigned 8 bit and stored to the destination.
// *
// *  register usage:
// *    x0  : pi2_src
// *    x1  : pi2_tmp   - not used
// *    x2  : pu1_pred  - advanced by pred_strd after every row
// *    x3  : pu1_dst   - advanced by dst_strd after every row
// *    x4  : src_strd on entry - not used as a stride, overwritten as scratch
// *    w5  : pred_strd - sign-extended into x5 below
// *    w6  : dst_strd  - sign-extended into x6 below
// *    x12, x14 : scratch (table value / table address)
// *
// *  NEON registers written: v0-v7, v8, v10, v12, v14, v16, v18, v20, v22, v30.
// *  push_v_regs / pop_v_regs come from impeg2_neon_macros.s and presumably
// *  save/restore the callee-saved vector registers - confirm in that file.
// *
// * @returns  void
// *
// *******************************************************************************
// */
.global impeg2_idct_recon_dc_av8
.type impeg2_idct_recon_dc_av8, %function
impeg2_idct_recon_dc_av8:
    push_v_regs

    // The strides are documented as 32-bit ints (word32). AAPCS64 leaves the
    // upper 32 bits of w5/w6 unspecified, so they must be sign-extended before
    // they are used as 64-bit post-index increments.
    sxtw            x5, w5                          // x5 = (int64) pred_strd
    sxtw            x6, w6                          // x6 = (int64) dst_strd

    // Scalar DC computation is interleaved with the prediction-row loads.
    ldrsh           x4, [x0]                        // x4 = pi2_src[0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]                      // x12 = gai2_impeg2_idct_q15[0]

    ld1             {v0.8b}, [x2], x5               // pred row 0
    mul             x4, x4, x12

    ld1             {v1.8b}, [x2], x5               // pred row 1
    add             x4, x4, #idct_stg1_round

    ld1             {v2.8b}, [x2], x5               // pred row 2
    asr             x4, x4, #idct_stg1_shift        // stage 1 result

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]                      // x12 = gai2_impeg2_idct_q11[0]

    ld1             {v3.8b}, [x2], x5               // pred row 3
    mul             x4, x4, x12

    ld1             {v4.8b}, [x2], x5               // pred row 4
    add             x4, x4, #idct_stg2_round

    ld1             {v5.8b}, [x2], x5               // pred row 5
    asr             x4, x4, #idct_stg2_shift        // stage 2 result = DC value

    ld1             {v6.8b}, [x2], x5               // pred row 6
    dup             v30.8h, w4                      // broadcast DC to 8 x 16-bit lanes

    ld1             {v7.8b}, [x2], x5               // pred row 7

    // For each row: widen pred to 16 bit and add DC (uaddw), saturate the
    // signed 16-bit sum to unsigned 8 bit (sqxtun), store and step the
    // destination pointer by dst_strd.
    uaddw           v8.8h, v30.8h, v0.8b

    uaddw           v10.8h, v30.8h, v1.8b
    sqxtun          v0.8b, v8.8h

    uaddw           v12.8h, v30.8h, v2.8b
    sqxtun          v1.8b, v10.8h
    st1             {v0.8b}, [x3], x6

    uaddw           v14.8h, v30.8h, v3.8b
    sqxtun          v2.8b, v12.8h
    st1             {v1.8b}, [x3], x6

    uaddw           v16.8h, v30.8h, v4.8b
    sqxtun          v3.8b, v14.8h
    st1             {v2.8b}, [x3], x6

    uaddw           v18.8h, v30.8h, v5.8b
    sqxtun          v4.8b, v16.8h
    st1             {v3.8b}, [x3], x6

    uaddw           v20.8h, v30.8h, v6.8b
    sqxtun          v5.8b, v18.8h
    st1             {v4.8b}, [x3], x6

    uaddw           v22.8h, v30.8h, v7.8b
    sqxtun          v6.8b, v20.8h
    st1             {v5.8b}, [x3], x6

    sqxtun          v7.8b, v22.8h
    st1             {v6.8b}, [x3], x6

    st1             {v7.8b}, [x3], x6

    pop_v_regs
    ret



///**
// *******************************************************************************
// *
// * @brief
// *  DC-only inverse transform and reconstruction of an 8x8 block with a
// *  per-sample additive term taken from gai2_impeg2_mismatch_stg2_additive
// *
// * @par description:
// *  The stage 1 DC value is computed as in impeg2_idct_recon_dc_av8 and then
// *  multiplied by the first entry of gai2_impeg2_idct_q11; this 32-bit product
// *  is kept unrounded and broadcast. For each of the 8 rows, 8 halfwords are
// *  read from gai2_impeg2_mismatch_stg2_additive (16 bytes per row),
// *  sign-extended to 32 bit, added to the product with rounding and narrowed
// *  to the high 16 bits (raddhn, i.e. (a + b + 0x8000) >> 16). The result is
// *  added to the prediction row, saturated to unsigned 8 bit and stored.
// *
// *  register usage:
// *    x0  : pi2_src
// *    x2  : pu1_pred  - advanced by pred_strd after every row
// *    x3  : pu1_dst   - advanced by dst_strd after every row
// *    x4  : scratch, then pointer into gai2_impeg2_mismatch_stg2_additive
// *    w5  : pred_strd - sign-extended into x5 below
// *    w6  : dst_strd  - sign-extended into x6 below
// *    x12 : scratch (table value)
// *    x14 : scratch (table address), then table row increment (16 bytes)
// *
// *  NEON registers written: v0, v2, v3, v8, v10, v12, v14, v30.
// *  push_v_regs / pop_v_regs come from impeg2_neon_macros.s and presumably
// *  save/restore the callee-saved vector registers - confirm in that file.
// *
// * @returns  void
// *
// *******************************************************************************
// */
.global impeg2_idct_recon_dc_mismatch_av8
.type impeg2_idct_recon_dc_mismatch_av8, %function
.extern gai2_impeg2_idct_last_row_q11
.extern gai2_impeg2_mismatch_stg1_outp
impeg2_idct_recon_dc_mismatch_av8:
    push_v_regs

    // See impeg2_idct_recon_dc_av8: 32-bit stride arguments have unspecified
    // upper halves under AAPCS64 and must be widened before 64-bit use.
    sxtw            x5, w5                          // x5 = (int64) pred_strd
    sxtw            x6, w6                          // x6 = (int64) dst_strd

    ldrsh           x4, [x0]                        // x4 = pi2_src[0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]                      // x12 = gai2_impeg2_idct_q15[0]

    mul             x4, x4, x12
    add             x4, x4, #idct_stg1_round
    asr             x4, x4, #idct_stg1_shift        // stage 1 result

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]                      // x12 = gai2_impeg2_idct_q11[0]
    mul             x4, x4, x12                     // stage 2 product, not yet rounded
    dup             v0.4s, w4                       // broadcast to 4 x 32-bit lanes

    mov             x14, #16                        // increment for table read (8 halfwords)
    adrp            x4, :got:gai2_impeg2_mismatch_stg2_additive
    ldr             x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]

    // One iteration per output row; expands to the same straight-line code
    // as writing the sequence out eight times.
    .rept 8
    ld1             {v2.4h, v3.4h}, [x4], x14       // additive terms for this row
    ld1             {v30.8b}, [x2], x5              // prediction row
    sxtl            v8.4s, v2.4h                    // widen terms 0-3 to 32 bit
    sxtl            v10.4s, v3.4h                   // widen terms 4-7 to 32 bit
    raddhn          v12.4h, v0.4s, v8.4s            // (dc + term + rnd) >> 16, lanes 0-3
    raddhn2         v12.8h, v0.4s, v10.4s           // (dc + term + rnd) >> 16, lanes 4-7
    uaddw           v14.8h, v12.8h, v30.8b          // add prediction
    sqxtun          v30.8b, v14.8h                  // saturate to unsigned 8 bit
    st1             {v30.8b}, [x3], x6              // store row, advance destination
    .endr

    pop_v_regs
    ret

.globl impeg2_idct_recon_av8

.type impeg2_idct_recon_av8, %function

impeg2_idct_recon_av8:
////register usage.extern        - loading and until idct of columns
////    cosine constants     -     d0
////    sine constants       -     d1
////    row 0 first half     -     d2        -    y0
////    row 1 first half     -     d6        -    y1
////    row 2 first half     -     d3        -    y2
////    row 3 first half     -     d7        -    y3
////    row 4 first half     -     d10       -    y4
////    row 5 first half     -     d14       -    y5
////    row 6 first half     -     d11       -    y6
////    row 7 first half     -     d15       -    y7

////    row 0 second half    -     d4        -    y0
////    row 1 second half    -     d8        -    y1
////    row 2 second half    -     d5        -    y2
////    row 3 second half    -     d9        -    y3
////    row 4 second half    -     d12       -    y4
////    row 5 second half    -     d16       -    y5
////    row 6 second half    -     d13       -    y6
////    row 7 second half    -     d17       -    y7

    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr             w11, [sp]                       // zero rows

    push_v_regs
    stp             x19, x20, [sp, #-16]!
357 358 mov x12, x7 // zero columns 359 mov x8, x5 // prediction stride 360 mov x7, x6 // destination stride 361 mov x6, x4 // src stride 362 lsl x6, x6, #1 // x sizeof(word16) 363 add x9, x0, x6, lsl #1 // 2 rows 364 365 add x10, x6, x6, lsl #1 // 3 rows 366 367 sub x10, x10, #8 // - 4 cols * sizeof(word16) 368 sub x5, x6, #8 // src_strd - 4 cols * sizeof(word16) 369 370 adrp x14, :got:gai2_impeg2_idct_first_col_q15 371 ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] 372 ld1 {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data 373 374 ////step 2 load all the input data 375 ////step 3 operate first 4 colums at a time 376 377 and x11, x11, #0xff 378 and x12, x12, #0xff 379 380 cmp x11, #0xf0 381 bge skip_last4_rows 382 383 384 ld1 {v2.4h}, [x0], #8 385 ld1 {v3.4h}, [x9], #8 386 ld1 {v4.4h}, [x0], x5 387 smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 388 ld1 {v5.4h}, [x9], x5 389 smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) 390 ld1 {v6.4h}, [x0], #8 391 ld1 {v7.4h}, [x9], #8 392 smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) 393 ld1 {v8.4h}, [x0], x10 394 smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) 395 ld1 {v9.4h}, [x9], x10 396 smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) 397 ld1 {v10.4h}, [x0], #8 398 smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) 399 ld1 {v11.4h}, [x9], #8 400 smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 401 ld1 {v12.4h}, [x0], x5 402 smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 403 ld1 {v13.4h}, [x9], x5 404 smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 405 ld1 {v14.4h}, [x0], #8 406 smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 407 ld1 {v15.4h}, [x9], #8 408 smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) 409 ld1 {v16.4h}, [x0], x10 410 smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) 411 ld1 {v17.4h}, [x9], x10 
412 413 ///* this following was activated when alignment is not there */ 414 //// vld1.16 d2,[x0]! 415 //// vld1.16 d3,[x2]! 416 //// vld1.16 d4,[x0]! 417 //// vld1.16 d5,[x2]! 418 //// vld1.16 d6,[x0]! 419 //// vld1.16 d7,[x2]! 420 //// vld1.16 d8,[x0],x3 421 //// vld1.16 d9,[x2],x3 422 //// vld1.16 d10,[x0]! 423 //// vld1.16 d11,[x2]! 424 //// vld1.16 d12,[x0]! 425 //// vld1.16 d13,[x2]! 426 //// vld1.16 d14,[x0]! 427 //// vld1.16 d15,[x2]! 428 //// vld1.16 d16,[x0],x3 429 //// vld1.16 d17,[x2],x3 430 431 432 433 434 smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 435 smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 436 smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 437 smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 438 439 smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 440 smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 441 442 add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 443 sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 444 445 smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) 446 smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) 447 smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) 448 smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) 449 450 add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) 451 sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) 452 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 453 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 454 455 add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) 456 sub v6.4s, v14.4s , v24.4s //// a0 - b0(part 
of x7) 457 458 add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) 459 sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) 460 461 add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) 462 sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) 463 464 add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) 465 sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) 466 467 sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 468 sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 469 sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 470 sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 471 sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 472 sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 473 sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 474 sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 475 476 477 b last4_cols 478 479 480 481 skip_last4_rows: 482 adrp x14, :got:gai2_impeg2_idct_first_col_q15 483 ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] 484 ld1 {v0.4h, v1.4h}, [x14] 485 486 ld1 {v2.4h}, [x0], #8 487 ld1 {v3.4h}, [x9], #8 488 ld1 {v4.4h}, [x0], x5 489 ld1 {v5.4h}, [x9], x5 490 ld1 {v6.4h}, [x0], #8 491 ld1 {v7.4h}, [x9], #8 492 ld1 {v8.4h}, [x0], x10 493 ld1 {v9.4h}, [x9], x10 494 495 496 497 movi v12.4h, #0 498 movi v13.4h, #0 499 movi v16.4h, #0 500 movi v17.4h, #0 501 502 503 504 505 smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) 506 smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) 507 smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) 508 smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) 509 510 smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 511 smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * 
sin1(part of b1) 512 smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 513 smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 514 515 smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) 516 smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) 517 518 smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 519 520 521 add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) 522 sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) 523 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 524 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 525 526 add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) 527 sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) 528 529 add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) 530 sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) 531 532 add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) 533 sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) 534 535 add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) 536 sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) 537 538 sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 539 sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 540 sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 541 sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 542 sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 543 sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 544 sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 545 sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 546 547 548 last4_cols: 549 adrp x14, :got:gai2_impeg2_idct_first_col_q15 550 ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] 
551 ld1 {v0.4h, v1.4h}, [x14] 552 553 554 cmp x12, #0xf0 555 bge skip_last4cols 556 557 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) 558 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) 559 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 560 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 561 562 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 563 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 564 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 565 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 566 567 smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1) 568 smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0) 569 570 smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 571 smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) 572 573 smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 574 smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 575 smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 576 smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 577 578 smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 579 smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 580 581 add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 582 sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 583 584 smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) 585 smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) 586 smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) 587 smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * 
cos1(part of e3,e4) 588 589 add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7) 590 sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4) 591 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5) 592 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6) 593 594 add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0) 595 sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7) 596 597 add v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2) 598 sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5) 599 600 add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1) 601 sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6) 602 603 add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3) 604 sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4) 605 606 sqrshrn v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) 607 sqrshrn v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) 608 sqrshrn v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) 609 sqrshrn v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) 610 sqrshrn v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) 611 sqrshrn v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) 612 sqrshrn v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) 613 sqrshrn v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) 614 b end_skip_last4cols 615 616 617 618 skip_last4cols: 619 adrp x14, :got:gai2_impeg2_idct_first_col_q11 620 ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] 621 ld1 {v0.4h, v1.4h}, [x14] 622 623 umov x15, v25.d[0] 624 625 trn1 v25.4h, v2.4h, v6.4h 626 trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing 627 628 trn1 v27.4h, v3.4h, v7.4h 629 trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing 630 631 trn1 v6.2s, v29.2s, v31.2s 632 trn2 v7.2s, 
v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 633 trn1 v2.2s, v25.2s, v27.2s 634 trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 635 636 637 trn1 v25.4h, v10.4h, v14.4h 638 trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing 639 640 trn1 v27.4h, v11.4h, v15.4h 641 trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing 642 643 trn1 v10.2s, v25.2s, v27.2s 644 trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third qudrant transposing continued..... 645 trn1 v14.2s, v29.2s, v31.2s 646 trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... 647 648 mov v25.d[0], x15 649 650 smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) 651 smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) 652 smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) 653 smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) 654 655 smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 656 smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 657 smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 658 smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 659 660 smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 661 // vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1) 662 663 smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) 664 smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) 665 666 667 668 669 sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) 670 add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) 671 672 673 add v2.4s, v4.4s , v24.4s 674 675 sub v6.4s, v4.4s , v24.4s 676 677 add v8.4s, v22.4s , v30.4s 678 679 sub v24.4s, v22.4s , v30.4s 680 681 sqrshrn v5.4h, v8.4s, #idct_stg2_shift 682 sqrshrn v2.4h, v2.4s, #idct_stg2_shift 683 sqrshrn v9.4h, v6.4s, #idct_stg2_shift 684 sqrshrn v6.4h, v24.4s, #idct_stg2_shift 685 686 sub v22.4s, 
v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 687 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 688 689 690 add v30.4s, v22.4s , v28.4s 691 692 sub v24.4s, v22.4s , v28.4s 693 694 add v28.4s, v18.4s , v26.4s 695 696 sub v22.4s, v18.4s , v26.4s 697 sqrshrn v4.4h, v30.4s, #idct_stg2_shift 698 sqrshrn v7.4h, v24.4s, #idct_stg2_shift 699 sqrshrn v3.4h, v28.4s, #idct_stg2_shift 700 sqrshrn v8.4h, v22.4s, #idct_stg2_shift 701 702 703 704 umov x19, v25.d[0] 705 umov x20, v25.d[1] 706 707 trn1 v27.4h, v2.4h, v3.4h 708 trn2 v29.4h, v2.4h, v3.4h 709 trn1 v25.4h, v4.4h, v5.4h 710 trn2 v31.4h, v4.4h, v5.4h 711 712 trn1 v2.2s, v27.2s, v25.2s 713 trn2 v4.2s, v27.2s, v25.2s 714 trn1 v3.2s, v29.2s, v31.2s 715 trn2 v5.2s, v29.2s, v31.2s 716 717 trn1 v27.4h, v6.4h, v7.4h 718 trn2 v29.4h, v6.4h, v7.4h 719 trn1 v25.4h, v8.4h, v9.4h 720 trn2 v31.4h, v8.4h, v9.4h 721 722 trn1 v6.2s, v27.2s, v25.2s 723 trn2 v8.2s, v27.2s, v25.2s 724 trn1 v7.2s, v29.2s, v31.2s 725 trn2 v9.2s, v29.2s, v31.2s 726 727 mov v25.d[0], x19 728 mov v25.d[1], x20 729 730 smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0) 731 732 smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) 733 smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) 734 smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) 735 736 smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 737 smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 738 smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 739 smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 740 smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 741 smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) 742 smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0) 743 744 745 add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data 746 747 748 add x5, x8, x8, lsl #1 // 749 750 751 add x0, x3, x7, lsl #1 // x0 points 
to 3rd row of dest data 752 753 754 add x10, x7, x7, lsl #1 // 755 756 // swapping v3 and v6 757 mov v31.d[0], v3.d[0] 758 mov v3.d[0], v6.d[0] 759 mov v6.d[0], v31.d[0] 760 761 // swapping v5 and v8 762 mov v31.d[0], v5.d[0] 763 mov v5.d[0], v8.d[0] 764 mov v8.d[0], v31.d[0] 765 766 767 sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) 768 add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) 769 770 771 add v0.4s, v12.4s , v24.4s 772 773 774 sub v24.4s, v12.4s , v24.4s 775 776 777 add v12.4s, v22.4s , v30.4s 778 779 780 sub v14.4s, v22.4s , v30.4s 781 782 sqrshrn v10.4h, v0.4s, #idct_stg2_shift 783 sqrshrn v17.4h, v24.4s, #idct_stg2_shift 784 sqrshrn v13.4h, v12.4s, #idct_stg2_shift 785 sqrshrn v14.4h, v14.4s, #idct_stg2_shift 786 787 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 788 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 789 790 791 add v0.4s, v22.4s , v28.4s 792 793 794 sub v24.4s, v22.4s , v28.4s 795 796 797 add v28.4s, v18.4s , v26.4s 798 799 800 sub v26.4s, v18.4s , v26.4s 801 ld1 {v18.8b}, [x2], x8 802 803 sqrshrn v12.4h, v0.4s, #idct_stg2_shift 804 ld1 {v20.8b}, [x2], x5 805 806 807 sqrshrn v15.4h, v24.4s, #idct_stg2_shift 808 ld1 {v19.8b}, [x2], x8 809 810 811 812 813 sqrshrn v11.4h, v28.4s, #idct_stg2_shift 814 ld1 {v22.8b}, [x4], x8 815 816 817 818 819 sqrshrn v16.4h, v26.4s, #idct_stg2_shift 820 ld1 {v21.8b}, [x2], x5 821 822 823 b pred_buff_addition 824 end_skip_last4cols: 825 adrp x14, :got:gai2_impeg2_idct_first_col_q11 826 ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] 827 ld1 {v0.4h, v1.4h}, [x14] 828 829 830 umov x19, v25.d[0] 831 umov x20, v25.d[1] 832 833 ///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */ 834 trn1 v27.4h, v2.4h, v6.4h 835 trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing 836 trn1 v25.4h, v3.4h, v7.4h 837 trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing 838 839 trn1 v2.2s, 
v27.2s, v25.2s 840 trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 841 trn1 v6.2s, v29.2s, v31.2s 842 trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 843 844 trn1 v27.4h, v4.4h, v8.4h 845 trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second qudrant transposing 846 trn1 v25.4h, v5.4h, v9.4h 847 trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second qudrant transposing 848 849 trn1 v4.2s, v27.2s, v25.2s 850 trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second qudrant transposing continued..... 851 trn1 v8.2s, v29.2s, v31.2s 852 trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second qudrant transposing continued..... 853 854 trn1 v27.4h, v10.4h, v14.4h 855 trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing 856 trn1 v25.4h, v11.4h, v15.4h 857 trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing 858 859 trn1 v10.2s, v27.2s, v25.2s 860 trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third qudrant transposing continued..... 861 trn1 v14.2s, v29.2s, v31.2s 862 trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... 863 864 trn1 v27.4h, v12.4h, v16.4h 865 trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth qudrant transposing 866 trn1 v25.4h, v13.4h, v17.4h 867 trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth qudrant transposing 868 869 trn1 v12.2s, v27.2s, v25.2s 870 trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... 871 trn1 v16.2s, v29.2s, v31.2s 872 trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... 
873 874 mov v25.d[0], x19 875 mov v25.d[1], x20 876 877 ////step6 operate on first four rows and find their idct 878 ////register usage.extern - storing and idct of rows 879 //// cosine constants - d0 880 //// sine constants - d1 881 //// element 0 first four - d2 - y0 882 //// element 1 first four - d6 - y1 883 //// element 2 first four - d3 - y2 884 //// element 3 first four - d7 - y3 885 //// element 4 first four - d4 - y4 886 //// element 5 first four - d8 - y5 887 //// element 6 first four - d5 - y6 888 //// element 7 first four - d9 - y7 889 //// element 0 second four - d10 - y0 890 //// element 1 second four - d14 - y1 891 //// element 2 second four - d11 - y2 892 //// element 3 second four - d15 - y3 893 //// element 4 second four - d12 - y4 894 //// element 5 second four - d16 - y5 895 //// element 6 second four - d13 - y6 896 //// element 7 second four - d17 - y7 897 898 //// map between first kernel code seq and current 899 //// d2 -> d2 900 //// d6 -> d6 901 //// d3 -> d3 902 //// d7 -> d7 903 //// d10 -> d4 904 //// d14 -> d8 905 //// d11 -> d5 906 //// d15 -> d9 907 //// q3 -> q3 908 //// q5 -> q2 909 //// q7 -> q4 910 911 smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) 912 smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) 913 smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) 914 smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) 915 916 smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 917 smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 918 smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 919 smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 920 921 smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 922 smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) 923 924 smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) 925 smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0) 926 927 928 smlal v24.4s, 
v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 929 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 930 smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 931 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 932 933 smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 934 smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 935 936 add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 937 sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 938 939 smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) 940 smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) 941 smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) 942 smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) 943 944 sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) 945 add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) 946 947 948 add v2.4s, v4.4s , v24.4s 949 950 sub v6.4s, v4.4s , v24.4s 951 952 add v8.4s, v22.4s , v30.4s 953 954 sub v24.4s, v22.4s , v30.4s 955 956 sqrshrn v5.4h, v8.4s, #idct_stg2_shift 957 sqrshrn v2.4h, v2.4s, #idct_stg2_shift 958 sqrshrn v9.4h, v6.4s, #idct_stg2_shift 959 sqrshrn v6.4h, v24.4s, #idct_stg2_shift 960 961 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 962 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 963 964 965 add v30.4s, v22.4s , v28.4s 966 967 sub v24.4s, v22.4s , v28.4s 968 969 add v28.4s, v18.4s , v26.4s 970 971 sub v22.4s, v18.4s , v26.4s 972 sqrshrn v4.4h, v30.4s, #idct_stg2_shift 973 sqrshrn v7.4h, v24.4s, #idct_stg2_shift 974 sqrshrn v3.4h, v28.4s, #idct_stg2_shift 975 sqrshrn v8.4h, v22.4s, #idct_stg2_shift 976 977 978 
979 umov x19, v25.d[0] 980 umov x20, v25.d[1] 981 982 trn1 v27.4h, v2.4h, v3.4h 983 trn2 v29.4h, v2.4h, v3.4h 984 trn1 v25.4h, v4.4h, v5.4h 985 trn2 v31.4h, v4.4h, v5.4h 986 987 trn1 v2.2s, v27.2s, v25.2s 988 trn2 v4.2s, v27.2s, v25.2s 989 trn1 v3.2s, v29.2s, v31.2s 990 trn2 v5.2s, v29.2s, v31.2s 991 992 trn1 v27.4h, v6.4h, v7.4h 993 trn2 v29.4h, v6.4h, v7.4h 994 trn1 v25.4h, v8.4h, v9.4h 995 trn2 v31.4h, v8.4h, v9.4h 996 997 trn1 v6.2s, v27.2s, v25.2s 998 trn2 v8.2s, v27.2s, v25.2s 999 trn1 v7.2s, v29.2s, v31.2s 1000 trn2 v9.2s, v29.2s, v31.2s 1001 1002 mov v25.d[0], x19 1003 mov v25.d[1], x20 1004 1005 1006 1007 smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0) 1008 smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1) 1009 smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2) 1010 smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3) 1011 smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1012 smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1013 smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1014 smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1015 smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1) 1016 smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1) 1017 smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) 1018 smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0) 1019 smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) 1020 1021 add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data 1022 smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) 1023 1024 add x5, x8, x8, lsl #1 // 1025 smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) 1026 1027 add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data 1028 smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) 
1029 1030 add x10, x7, x7, lsl #1 // 1031 smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) 1032 1033 1034 smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) 1035 1036 add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) 1037 sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) 1038 1039 smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) 1040 1041 // swapping v3 and v6 1042 mov v31.d[0], v3.d[0] 1043 mov v3.d[0], v6.d[0] 1044 mov v6.d[0], v31.d[0] 1045 1046 smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) 1047 // swapping v5 and v8 1048 mov v31.d[0], v5.d[0] 1049 mov v5.d[0], v8.d[0] 1050 mov v8.d[0], v31.d[0] 1051 1052 smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) 1053 smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) 1054 1055 sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) 1056 add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) 1057 1058 1059 add v0.4s, v12.4s , v24.4s 1060 1061 1062 sub v24.4s, v12.4s , v24.4s 1063 1064 1065 add v12.4s, v22.4s , v30.4s 1066 1067 1068 sub v14.4s, v22.4s , v30.4s 1069 1070 sqrshrn v10.4h, v0.4s, #idct_stg2_shift 1071 sqrshrn v17.4h, v24.4s, #idct_stg2_shift 1072 sqrshrn v13.4h, v12.4s, #idct_stg2_shift 1073 sqrshrn v14.4h, v14.4s, #idct_stg2_shift 1074 1075 sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) 1076 add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) 1077 1078 1079 add v0.4s, v22.4s , v28.4s 1080 1081 1082 sub v24.4s, v22.4s , v28.4s 1083 1084 1085 add v28.4s, v18.4s , v26.4s 1086 1087 1088 sub v26.4s, v18.4s , v26.4s 1089 ld1 {v18.8b}, [x2], x8 1090 1091 sqrshrn v12.4h, v0.4s, #idct_stg2_shift 1092 ld1 {v20.8b}, [x2], x5 1093 1094 1095 sqrshrn v15.4h, v24.4s, 
#idct_stg2_shift 1096 ld1 {v19.8b}, [x2], x8 1097 1098 1099 1100 1101 sqrshrn v11.4h, v28.4s, #idct_stg2_shift 1102 ld1 {v22.8b}, [x4], x8 1103 1104 1105 1106 1107 sqrshrn v16.4h, v26.4s, #idct_stg2_shift 1108 ld1 {v21.8b}, [x2], x5 1109 1110 1111 1112 1113 pred_buff_addition: 1114 1115 umov x19, v25.d[0] 1116 umov x20, v25.d[1] 1117 1118 trn1 v27.4h, v10.4h, v11.4h 1119 trn2 v29.4h, v10.4h, v11.4h 1120 trn1 v25.4h, v12.4h, v13.4h 1121 trn2 v31.4h, v12.4h, v13.4h 1122 1123 trn1 v10.2s, v27.2s, v25.2s 1124 trn2 v12.2s, v27.2s, v25.2s 1125 trn1 v11.2s, v29.2s, v31.2s 1126 trn2 v13.2s, v29.2s, v31.2s 1127 1128 trn1 v27.4h, v14.4h, v15.4h 1129 trn2 v29.4h, v14.4h, v15.4h 1130 trn1 v25.4h, v16.4h, v17.4h 1131 trn2 v31.4h, v16.4h, v17.4h 1132 1133 trn1 v14.2s, v27.2s, v25.2s 1134 trn2 v16.2s, v27.2s, v25.2s 1135 trn1 v15.2s, v29.2s, v31.2s 1136 trn2 v17.2s, v29.2s, v31.2s 1137 1138 1139 mov v25.d[0], x19 1140 mov v25.d[1], x20 1141 1142 1143 ld1 {v24.8b}, [x4], x5 1144 ld1 {v23.8b}, [x4], x8 1145 ld1 {v25.8b}, [x4], x5 1146 mov v2.d[1], v3.d[0] 1147 mov v4.d[1], v5.d[0] 1148 mov v6.d[1], v7.d[0] 1149 mov v8.d[1], v9.d[0] 1150 uaddw v2.8h, v2.8h , v18.8b 1151 uaddw v4.8h, v4.8h , v22.8b 1152 uaddw v6.8h, v6.8h , v20.8b 1153 uaddw v8.8h, v8.8h , v24.8b 1154 1155 // swapping v11 and v14 1156 mov v31.d[0], v11.d[0] 1157 mov v11.d[0], v14.d[0] 1158 mov v14.d[0], v31.d[0] 1159 1160 // swapping v13 and v16 1161 mov v31.d[0], v13.d[0] 1162 mov v13.d[0], v16.d[0] 1163 mov v16.d[0], v31.d[0] 1164 // row values stored in the q register. 
1165 1166 //q1 :x0 1167 //q3: x1 1168 //q2: x2 1169 //q4: x3 1170 //q5: x4 1171 //q7: x5 1172 //q6: x6 1173 //q8: x7 1174 1175 1176 1177 ///// adding the prediction buffer 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 // load prediction data 1188 1189 1190 1191 1192 1193 //adding recon with prediction 1194 1195 1196 1197 1198 mov v10.d[1], v11.d[0] 1199 mov v12.d[1], v13.d[0] 1200 mov v14.d[1], v15.d[0] 1201 mov v16.d[1], v17.d[0] 1202 uaddw v10.8h, v10.8h , v19.8b 1203 sqxtun v2.8b, v2.8h 1204 uaddw v14.8h, v14.8h , v21.8b 1205 sqxtun v4.8b, v4.8h 1206 uaddw v12.8h, v12.8h , v23.8b 1207 sqxtun v6.8b, v6.8h 1208 uaddw v16.8h, v16.8h , v25.8b 1209 sqxtun v8.8b, v8.8h 1210 1211 1212 1213 1214 1215 1216 1217 st1 {v2.8b}, [x3], x7 1218 sqxtun v10.8b, v10.8h 1219 st1 {v6.8b}, [x3], x10 1220 sqxtun v14.8b, v14.8h 1221 st1 {v4.8b}, [x0], x7 1222 sqxtun v12.8b, v12.8h 1223 st1 {v8.8b}, [x0], x10 1224 sqxtun v16.8b, v16.8h 1225 1226 1227 1228 1229 1230 1231 1232 st1 {v10.8b}, [x3], x7 1233 st1 {v14.8b}, [x3], x10 1234 st1 {v12.8b}, [x0], x7 1235 st1 {v16.8b}, [x0], x10 1236 1237 1238 1239 1240 // ldmfd sp!,{x4-x12,pc} 1241 ldp x19, x20, [sp], #16 1242 pop_v_regs 1243 ret 1244 1245 1246 1247 1248