1 ///***************************************************************************** 2 //* 3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //*****************************************************************************/ 18 ///** 19 // ******************************************************************************* 20 // * @file 21 // * ihevc_itrans_recon_8x8_neon.s 22 // * 23 // * @brief 24 // * contains function definitions for single stage inverse transform 25 // * 26 // * @author 27 // * anand s 28 // * 29 // * @par list of functions: 30 // * - ihevc_itrans_recon_32x32() 31 // * 32 // * @remarks 33 // * the input buffer is being corrupted 34 // * 35 // ******************************************************************************* 36 //*/ 37 38 ///** 39 // ******************************************************************************* 40 // * 41 // * @brief 42 // * this function performs inverse transform and reconstruction for 32x32 43 // * input block 44 // * 45 // * @par description: 46 // * performs inverse transform and adds the prediction data and clips output 47 // * to 8 bit 48 // * 49 // * @param[in] pi2_src 50 // * input 32x32 coefficients 51 // * 52 // * @param[in] pi2_tmp 53 // * temporary 32x32 buffer for storing inverse 54 // * 55 // * transform 56 // * 1st stage output 57 // * 58 // * @param[in] pu1_pred 59 // * prediction 32x32 block 60 // * 61 
// * @param[out] pu1_dst 62 // * output 32x32 block 63 // * 64 // * @param[in] src_strd 65 // * input stride 66 // * 67 // * @param[in] pred_strd 68 // * prediction stride 69 // * 70 // * @param[in] dst_strd 71 // * output stride 72 // * 73 // * @param[in] shift 74 // * output shift 75 // * 76 // * @param[in] zero_cols 77 // * zero columns in pi2_src (kept in x12) 78 // * 79 // * @returns void 80 // * 81 // * @remarks 82 // * none 83 // * 84 // ******************************************************************************* 85 // */ 86 87 //void ihevc_itrans_recon_32x32(word16 *pi2_src, 88 // word16 *pi2_tmp, 89 // uword8 *pu1_pred, 90 // uword8 *pu1_dst, 91 // word32 src_strd, 92 // word32 pred_strd, 93 // word32 dst_strd, 94 // word32 zero_cols, 95 // word32 zero_rows ) 96 97 //**************variables vs registers************************* 98 // x0 => *pi2_src 99 // x1 => *pi2_tmp 100 // x2 => *pu1_pred 101 // x3 => *pu1_dst 102 // src_strd 103 // pred_strd 104 // dst_strd 105 // x12 => zero_cols 106 // x11 => zero_rows 107 108 109 //d0[0]= 64 d2[0]=83 110 //d0[1]= 90 d2[1]=82 111 //d0[2]= 90 d2[2]=80 112 //d0[3]= 90 d2[3]=78 113 //d1[0]= 89 d3[0]=75 114 //d1[1]= 88 d3[1]=73 115 //d1[2]= 87 d3[2]=70 116 //d1[3]= 85 d3[3]=67 117 118 //d4[0]= 64 d6[0]=36 119 //d4[1]= 61 d6[1]=31 120 //d4[2]= 57 d6[2]=25 121 //d4[3]= 54 d6[3]=22 122 //d5[0]= 50 d7[0]=18 123 //d5[1]= 46 d7[1]=13 124 //d5[2]= 43 d7[2]=9 125 //d5[3]= 38 d7[3]=4 126 127 .text 128 .align 4 129 .include "ihevc_neon_macros.s" 130 131 132 133 134 .set shift_stage1_idct , 7 135 .set shift_stage2_idct , 12 136 137 //#define zero_cols x12 138 //#define zero_rows x11 139 140 .globl ihevc_itrans_recon_32x32_av8 141 142 .extern g_ai2_ihevc_trans_32_transpose 143 144 x5_addr: .word 0xfffff000 145 x9_addr: .word 0xffff0000 146 147 .type ihevc_itrans_recon_32x32_av8, %function 148 149 ihevc_itrans_recon_32x32_av8: 150 151 ldr w11, [sp] 152 153 // stmfd sp!,{x0-x12,x14} 154 push_v_regs 155 stp x19, x20,[sp,#-16]! 156 stp x0, x1,[sp,#-16]! 157 stp x5, x6,[sp,#-16]! 
158 159 //ldr x8,[sp,#56] @ prediction stride 160 //ldr x7,[sp,#64] @ destination stride 161 mov x6, x4 // src stride 162 mov x12, x7 163 lsl x6, x6, #1 // x sizeof(word16) 164 add x10,x6,x6, lsl #1 // 3 rows 165 166 167 mov x8,x0 168 169 adrp x14, :got:g_ai2_ihevc_trans_32_transpose 170 ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose] 171 172 ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32 173 ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32 174 175 //registers which are free 176 // x10,x9,x11,x12 177 mov x9,#0xffffff00 178 mov x10,#0xfffffff0 179 ldr w5, x5_addr 180 ldr w7, x9_addr 181 cmp x12,x10 182 mov x20,#1 183 csel x14, x20, x14,hs 184 bhs stage1 185 186 187 cmp x12,x9 188 mov x20,#2 189 csel x14, x20, x14,hs 190 bhs stage1 191 192 cmp x12,x5 193 mov x20,#3 194 csel x14, x20, x14,hs 195 bhs stage1 196 197 cmp x12,x7 198 mov x20,#4 199 csel x14, x20, x14,hs 200 201 mov x14,#8 202 b stage1 203 //.ltorg 204 205 206 dct_stage1: 207 add x8,x8,#8 208 mov x0,x8 209 210 stage1: 211 ld1 {v10.4h},[x0],x6 212 ld1 {v8.4h},[x0],x6 213 ld1 {v11.4h},[x0],x6 214 ld1 {v9.4h},[x0],x6 215 216 smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) 217 smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) 218 smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 219 smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 220 221 smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 222 smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 223 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 224 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 225 226 227 228 229 230 smull v20.4s, v10.4h, v0.4h[0] 231 smlal v20.4s, v11.4h, v0.4h[2] 232 233 234 smull v22.4s, v10.4h, v0.4h[0] 235 smlal v22.4s, v11.4h, v1.4h[2] 236 237 smull v16.4s, v10.4h, v0.4h[0] 238 smlal v16.4s, v11.4h, v2.4h[2] 239 240 smull v18.4s, v10.4h, v0.4h[0] 241 smlal v18.4s, v11.4h, v3.4h[2] 242 cmp x11,x10 243 bhs shift1 244 245 ld1 
{v12.4h},[x0],x6 246 ld1 {v14.4h},[x0],x6 247 ld1 {v13.4h},[x0],x6 248 ld1 {v15.4h},[x0],x6 249 250 251 252 253 254 255 256 smlal v24.4s, v14.4h, v1.4h[1] 257 smlal v26.4s, v14.4h, v3.4h[3] 258 smlal v28.4s, v14.4h, v6.4h[1] 259 smlsl v30.4s, v14.4h, v7.4h[1] 260 261 262 smlal v24.4s, v15.4h, v1.4h[3] 263 smlal v26.4s, v15.4h, v5.4h[1] 264 smlsl v28.4s, v15.4h, v7.4h[1] 265 smlsl v30.4s, v15.4h, v3.4h[3] 266 267 268 smlal v20.4s, v12.4h, v1.4h[0] 269 smlal v20.4s, v13.4h, v1.4h[2] 270 smlal v22.4s, v12.4h, v3.4h[0] 271 smlal v22.4s, v13.4h, v4.4h[2] 272 smlal v16.4s, v12.4h, v5.4h[0] 273 smlal v16.4s, v13.4h, v7.4h[2] 274 smlal v18.4s, v12.4h, v7.4h[0] 275 smlsl v18.4s, v13.4h, v5.4h[2] 276 277 cmp x11,x9 278 bhs shift1 279 280 ld1 {v10.4h},[x0],x6 281 ld1 {v8.4h},[x0],x6 282 ld1 {v11.4h},[x0],x6 283 ld1 {v9.4h},[x0],x6 284 285 286 smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) 287 smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) 288 smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) 289 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) 290 291 smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 292 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 293 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 294 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 295 296 297 298 299 300 smlal v20.4s, v10.4h, v2.4h[0] 301 smlal v20.4s, v11.4h, v2.4h[2] 302 303 304 smlal v22.4s, v10.4h, v6.4h[0] 305 smlal v22.4s, v11.4h, v7.4h[2] 306 307 smlsl v16.4s, v10.4h, v6.4h[0] 308 smlsl v16.4s, v11.4h, v3.4h[2] 309 310 smlsl v18.4s, v10.4h, v2.4h[0] 311 smlsl v18.4s, v11.4h, v1.4h[2] 312 313 cmp x11,x5 314 bhs shift1 315 316 317 ld1 {v12.4h},[x0],x6 318 ld1 {v14.4h},[x0],x6 319 ld1 {v13.4h},[x0],x6 320 ld1 {v15.4h},[x0],x6 321 322 323 324 325 326 327 328 329 330 smlal v24.4s, v14.4h, v3.4h[1] 331 smlsl v26.4s, v14.4h, v6.4h[1] 332 smlsl v28.4s, v14.4h, v0.4h[1] 333 smlsl v30.4s, 
v14.4h, v6.4h[3] 334 335 336 smlal v24.4s, v15.4h, v3.4h[3] 337 smlsl v26.4s, v15.4h, v4.4h[3] 338 smlsl v28.4s, v15.4h, v2.4h[3] 339 smlal v30.4s, v15.4h, v5.4h[3] 340 341 342 smlal v20.4s, v12.4h, v3.4h[0] 343 smlal v20.4s, v13.4h, v3.4h[2] 344 smlsl v22.4s, v12.4h, v7.4h[0] 345 smlsl v22.4s, v13.4h, v5.4h[2] 346 smlsl v16.4s, v12.4h, v1.4h[0] 347 smlsl v16.4s, v13.4h, v1.4h[2] 348 smlsl v18.4s, v12.4h, v5.4h[0] 349 smlal v18.4s, v13.4h, v7.4h[2] 350 351 cmp x11,x7 352 bhs shift1 353 354 355 ld1 {v10.4h},[x0],x6 356 ld1 {v8.4h},[x0],x6 357 ld1 {v11.4h},[x0],x6 358 ld1 {v9.4h},[x0],x6 359 360 361 362 smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 363 smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1) 364 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 365 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) 366 367 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 368 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 369 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 370 smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 371 372 373 374 375 376 smlal v20.4s, v10.4h, v0.4h[0] 377 smlal v20.4s, v11.4h, v4.4h[2] 378 379 380 smlsl v22.4s, v10.4h, v0.4h[0] 381 smlsl v22.4s, v11.4h, v2.4h[2] 382 383 smlsl v16.4s, v10.4h, v0.4h[0] 384 smlsl v16.4s, v11.4h, v6.4h[2] 385 386 smlal v18.4s, v10.4h, v0.4h[0] 387 smlal v18.4s, v11.4h, v0.4h[2] 388 389 390 391 ld1 {v12.4h},[x0],x6 392 ld1 {v14.4h},[x0],x6 393 ld1 {v13.4h},[x0],x6 394 ld1 {v15.4h},[x0],x6 395 396 397 398 399 smlal v24.4s, v14.4h, v5.4h[1] 400 smlsl v26.4s, v14.4h, v0.4h[2] 401 smlal v28.4s, v14.4h, v5.4h[3] 402 smlal v30.4s, v14.4h, v4.4h[3] 403 404 405 smlal v24.4s, v15.4h, v5.4h[3] 406 smlsl v26.4s, v15.4h, v1.4h[1] 407 smlal v28.4s, v15.4h, v3.4h[1] 408 smlsl v30.4s, v15.4h, v7.4h[3] 409 410 411 smlal v20.4s, v12.4h, v5.4h[0] 412 smlal v20.4s, v13.4h, v5.4h[2] 413 smlsl v22.4s, v12.4h, v1.4h[0] 414 smlsl 
v22.4s, v13.4h, v0.4h[2] 415 smlal v16.4s, v12.4h, v7.4h[0] 416 smlal v16.4s, v13.4h, v4.4h[2] 417 smlal v18.4s, v12.4h, v3.4h[0] 418 smlal v18.4s, v13.4h, v6.4h[2] 419 420 421 ld1 {v10.4h},[x0],x6 422 ld1 {v8.4h},[x0],x6 423 ld1 {v11.4h},[x0],x6 424 ld1 {v9.4h},[x0],x6 425 426 427 428 429 430 431 432 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 433 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 434 smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2) 435 smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3) 436 437 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 438 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 439 smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 440 smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 441 442 443 444 445 446 smlal v20.4s, v10.4h, v6.4h[0] 447 smlal v20.4s, v11.4h, v6.4h[2] 448 449 450 smlsl v22.4s, v10.4h, v2.4h[0] 451 smlsl v22.4s, v11.4h, v3.4h[2] 452 453 smlal v16.4s, v10.4h, v2.4h[0] 454 smlal v16.4s, v11.4h, v0.4h[2] 455 456 smlsl v18.4s, v10.4h, v6.4h[0] 457 smlsl v18.4s, v11.4h, v2.4h[2] 458 459 ld1 {v12.4h},[x0],x6 460 ld1 {v14.4h},[x0],x6 461 ld1 {v13.4h},[x0],x6 462 ld1 {v15.4h},[x0],x6 463 464 465 smlal v24.4s, v14.4h, v7.4h[1] 466 smlsl v26.4s, v14.4h, v5.4h[3] 467 smlal v28.4s, v14.4h, v4.4h[1] 468 smlsl v30.4s, v14.4h, v2.4h[3] 469 470 471 smlal v24.4s, v15.4h, v7.4h[3] 472 smlsl v26.4s, v15.4h, v7.4h[1] 473 smlal v28.4s, v15.4h, v6.4h[3] 474 smlsl v30.4s, v15.4h, v6.4h[1] 475 476 477 smlal v20.4s, v12.4h, v7.4h[0] 478 smlal v20.4s, v13.4h, v7.4h[2] 479 smlsl v22.4s, v12.4h, v5.4h[0] 480 smlsl v22.4s, v13.4h, v6.4h[2] 481 smlal v16.4s, v12.4h, v3.4h[0] 482 smlal v16.4s, v13.4h, v5.4h[2] 483 smlsl v18.4s, v12.4h, v1.4h[0] 484 smlsl v18.4s, v13.4h, v4.4h[2] 485 486 487 488 shift1: 489 add v8.4s, v20.4s , v24.4s 490 sub v10.4s, v20.4s , v24.4s 491 492 add v12.4s, v22.4s , v26.4s 493 sub v24.4s, v22.4s , v26.4s 
494 495 add v14.4s, v16.4s , v28.4s 496 sub v26.4s, v16.4s , v28.4s 497 498 499 add v16.4s, v18.4s , v30.4s 500 sub v28.4s, v18.4s , v30.4s 501 502 503 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 504 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 505 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 506 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 507 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 508 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 509 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 510 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 511 512 513 // registers used q15,q14,q6,q7 514 515 umov x15,v24.d[0] 516 umov x16,v25.d[0] 517 umov x19,v26.d[0] 518 umov x20,v27.d[0] 519 520 trn1 v24.4h, v30.4h, v12.4h 521 trn2 v25.4h, v30.4h, v12.4h 522 trn1 v26.4h, v31.4h, v13.4h 523 trn2 v27.4h, v31.4h, v13.4h 524 525 trn1 v30.2s, v24.2s, v26.2s 526 trn2 v31.2s, v24.2s, v26.2s 527 trn1 v12.2s, v25.2s, v27.2s 528 trn2 v13.2s, v25.2s, v27.2s 529 530 trn1 v24.4h, v14.4h, v18.4h 531 trn2 v25.4h, v14.4h, v18.4h 532 trn1 v26.4h, v15.4h, v19.4h 533 trn2 v27.4h, v15.4h, v19.4h 534 535 trn1 v14.2s, v24.2s, v26.2s 536 trn2 v15.2s, v24.2s, v26.2s 537 trn1 v18.2s, v25.2s, v27.2s 538 trn2 v19.2s, v25.2s, v27.2s 539 540 mov v24.d[0],x15 541 mov v25.d[0],x16 542 mov v26.d[0],x19 543 mov v27.d[0],x20 544 545 // d30 =x0 1- 4 values 546 // d31 =x2 1- 4 values 547 // d12=x1 1- 4 values 548 // d13=x3 1- 4 values 549 // d14 =x0 28-31 values 550 // d15 =x2 28- 31 values 551 // d18=x1 28- 31 values 552 // d19=x3 28- 31 values 553 554 555 556 st1 { v30.4h, v31.4h},[x1],#16 557 st1 { v12.4h, v13.4h},[x1],#16 558 add x1,x1,#192 559 st1 
{ v14.4h, v15.4h},[x1],#16 560 st1 { v18.4h, v19.4h},[x1],#16 561 sub x1,x1,#224 562 563 mov x0,x8 564 565 566 567 568 569 ld1 {v10.4h},[x0],x6 570 ld1 {v8.4h},[x0],x6 571 ld1 {v11.4h},[x0],x6 572 ld1 {v9.4h},[x0],x6 573 574 575 576 577 smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) 578 smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 579 smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) 580 smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 581 582 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 583 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 584 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 585 smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 586 587 588 589 590 591 smull v20.4s, v10.4h, v0.4h[0] 592 smlal v20.4s, v11.4h, v4.4h[2] 593 594 595 smull v22.4s, v10.4h, v0.4h[0] 596 smlal v22.4s, v11.4h, v5.4h[2] 597 598 smull v16.4s, v10.4h, v0.4h[0] 599 smlal v16.4s, v11.4h, v6.4h[2] 600 601 smull v18.4s, v10.4h, v0.4h[0] 602 smlal v18.4s, v11.4h, v7.4h[2] 603 cmp x11,x10 604 bhs shift2 605 606 ld1 {v12.4h},[x0],x6 607 ld1 {v14.4h},[x0],x6 608 ld1 {v13.4h},[x0],x6 609 ld1 {v15.4h},[x0],x6 610 611 612 smlsl v24.4s, v14.4h, v4.4h[3] 613 smlsl v26.4s, v14.4h, v2.4h[1] 614 smlsl v28.4s, v14.4h, v0.4h[1] 615 smlsl v30.4s, v14.4h, v2.4h[3] 616 617 618 smlsl v24.4s, v15.4h, v0.4h[3] 619 smlsl v26.4s, v15.4h, v3.4h[1] 620 smlsl v28.4s, v15.4h, v6.4h[3] 621 smlal v30.4s, v15.4h, v5.4h[3] 622 623 624 smlsl v20.4s, v12.4h, v7.4h[0] 625 smlsl v20.4s, v13.4h, v2.4h[2] 626 smlsl v22.4s, v12.4h, v5.4h[0] 627 smlsl v22.4s, v13.4h, v0.4h[2] 628 smlsl v16.4s, v12.4h, v3.4h[0] 629 smlsl v16.4s, v13.4h, v3.4h[2] 630 smlsl v18.4s, v12.4h, v1.4h[0] 631 smlsl v18.4s, v13.4h, v6.4h[2] 632 633 cmp x11,x9 634 bhs shift2 635 636 637 ld1 {v10.4h},[x0],x6 638 ld1 {v8.4h},[x0],x6 639 ld1 {v11.4h},[x0],x6 640 ld1 {v9.4h},[x0],x6 641 642 643 644 645 646 647 648 smlsl v24.4s, 
v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 649 smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) 650 smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2) 651 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 652 653 smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 654 smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 655 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 656 smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 657 658 659 660 661 662 smlsl v20.4s, v10.4h, v2.4h[0] 663 smlsl v20.4s, v11.4h, v6.4h[2] 664 665 666 smlsl v22.4s, v10.4h, v6.4h[0] 667 smlal v22.4s, v11.4h, v4.4h[2] 668 669 smlal v16.4s, v10.4h, v6.4h[0] 670 smlal v16.4s, v11.4h, v0.4h[2] 671 672 smlal v18.4s, v10.4h, v2.4h[0] 673 smlal v18.4s, v11.4h, v5.4h[2] 674 675 cmp x11,x5 676 bhs shift2 677 678 679 ld1 {v12.4h},[x0],x6 680 ld1 {v14.4h},[x0],x6 681 ld1 {v13.4h},[x0],x6 682 ld1 {v15.4h},[x0],x6 683 684 685 686 687 688 smlal v24.4s, v14.4h, v2.4h[3] 689 smlal v26.4s, v14.4h, v3.4h[3] 690 smlsl v28.4s, v14.4h, v5.4h[3] 691 smlsl v30.4s, v14.4h, v0.4h[3] 692 693 694 smlal v24.4s, v15.4h, v1.4h[3] 695 smlsl v26.4s, v15.4h, v6.4h[3] 696 smlsl v28.4s, v15.4h, v0.4h[3] 697 smlal v30.4s, v15.4h, v7.4h[3] 698 699 700 smlal v20.4s, v12.4h, v5.4h[0] 701 smlal v20.4s, v13.4h, v0.4h[2] 702 smlal v22.4s, v12.4h, v1.4h[0] 703 smlal v22.4s, v13.4h, v6.4h[2] 704 smlal v16.4s, v12.4h, v7.4h[0] 705 smlsl v16.4s, v13.4h, v2.4h[2] 706 smlsl v18.4s, v12.4h, v3.4h[0] 707 smlsl v18.4s, v13.4h, v4.4h[2] 708 709 710 cmp x11,x7 711 bhs shift2 712 713 714 ld1 {v10.4h},[x0],x6 715 ld1 {v8.4h},[x0],x6 716 ld1 {v11.4h},[x0],x6 717 ld1 {v9.4h},[x0],x6 718 719 720 721 722 723 724 725 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 726 smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1) 727 smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) 728 smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3) 729 730 
smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 731 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 732 smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 733 smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 734 735 736 737 738 739 smlal v20.4s, v10.4h, v0.4h[0] 740 smlsl v20.4s, v11.4h, v7.4h[2] 741 742 743 smlsl v22.4s, v10.4h, v0.4h[0] 744 smlsl v22.4s, v11.4h, v1.4h[2] 745 746 smlsl v16.4s, v10.4h, v0.4h[0] 747 smlal v16.4s, v11.4h, v5.4h[2] 748 749 smlal v18.4s, v10.4h, v0.4h[0] 750 smlal v18.4s, v11.4h, v3.4h[2] 751 752 753 754 ld1 {v12.4h},[x0],x6 755 ld1 {v14.4h},[x0],x6 756 ld1 {v13.4h},[x0],x6 757 ld1 {v15.4h},[x0],x6 758 759 760 smlsl v24.4s, v14.4h, v0.4h[1] 761 smlal v26.4s, v14.4h, v6.4h[1] 762 smlal v28.4s, v14.4h, v4.4h[1] 763 smlsl v30.4s, v14.4h, v1.4h[1] 764 765 766 smlsl v24.4s, v15.4h, v3.4h[3] 767 smlal v26.4s, v15.4h, v0.4h[1] 768 smlsl v28.4s, v15.4h, v5.4h[1] 769 smlsl v30.4s, v15.4h, v6.4h[1] 770 771 772 smlsl v20.4s, v12.4h, v3.4h[0] 773 smlsl v20.4s, v13.4h, v1.4h[2] 774 smlsl v22.4s, v12.4h, v7.4h[0] 775 smlal v22.4s, v13.4h, v3.4h[2] 776 smlal v16.4s, v12.4h, v1.4h[0] 777 smlal v16.4s, v13.4h, v7.4h[2] 778 smlsl v18.4s, v12.4h, v5.4h[0] 779 smlsl v18.4s, v13.4h, v2.4h[2] 780 781 ld1 {v10.4h},[x0],x6 782 ld1 {v8.4h},[x0],x6 783 ld1 {v11.4h},[x0],x6 784 ld1 {v9.4h},[x0],x6 785 786 787 788 789 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 790 smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1) 791 smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 792 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) 793 794 smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 795 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 796 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 797 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 798 799 800 801 802 803 smlsl v20.4s, 
v10.4h, v6.4h[0] 804 smlal v20.4s, v11.4h, v5.4h[2] 805 806 807 smlal v22.4s, v10.4h, v2.4h[0] 808 smlal v22.4s, v11.4h, v7.4h[2] 809 810 smlsl v16.4s, v10.4h, v2.4h[0] 811 smlsl v16.4s, v11.4h, v4.4h[2] 812 813 smlal v18.4s, v10.4h, v6.4h[0] 814 smlal v18.4s, v11.4h, v1.4h[2] 815 816 817 ld1 {v12.4h},[x0],x6 818 ld1 {v14.4h},[x0],x6 819 ld1 {v13.4h},[x0],x6 820 ld1 {v15.4h},[x0],x6 821 822 823 824 825 826 smlal v24.4s, v14.4h, v1.4h[1] 827 smlsl v26.4s, v14.4h, v0.4h[3] 828 smlal v28.4s, v14.4h, v1.4h[3] 829 smlsl v30.4s, v14.4h, v3.4h[1] 830 831 832 smlal v24.4s, v15.4h, v5.4h[3] 833 smlsl v26.4s, v15.4h, v5.4h[1] 834 smlal v28.4s, v15.4h, v4.4h[3] 835 smlsl v30.4s, v15.4h, v4.4h[1] 836 837 838 smlal v20.4s, v12.4h, v1.4h[0] 839 smlal v20.4s, v13.4h, v3.4h[2] 840 smlsl v22.4s, v12.4h, v3.4h[0] 841 smlsl v22.4s, v13.4h, v2.4h[2] 842 smlal v16.4s, v12.4h, v5.4h[0] 843 smlal v16.4s, v13.4h, v1.4h[2] 844 smlsl v18.4s, v12.4h, v7.4h[0] 845 smlsl v18.4s, v13.4h, v0.4h[2] 846 847 shift2: 848 add v8.4s, v20.4s , v24.4s 849 sub v10.4s, v20.4s , v24.4s 850 851 add v12.4s, v22.4s , v26.4s 852 sub v24.4s, v22.4s , v26.4s 853 854 add v14.4s, v16.4s , v28.4s 855 sub v26.4s, v16.4s , v28.4s 856 857 858 add v16.4s, v18.4s , v30.4s 859 sub v28.4s, v18.4s , v30.4s 860 861 862 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 863 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 864 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 865 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 866 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 867 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 868 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 869 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// 
x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 870 871 umov x15,v24.d[0] 872 umov x16,v25.d[0] 873 umov x19,v26.d[0] 874 umov x20,v27.d[0] 875 876 trn1 v24.4h, v30.4h, v12.4h 877 trn2 v25.4h, v30.4h, v12.4h 878 trn1 v26.4h, v31.4h, v13.4h 879 trn2 v27.4h, v31.4h, v13.4h 880 881 trn1 v30.2s, v24.2s, v26.2s 882 trn2 v31.2s, v24.2s, v26.2s 883 trn1 v12.2s, v25.2s, v27.2s 884 trn2 v13.2s, v25.2s, v27.2s 885 886 trn1 v24.4h, v14.4h, v18.4h 887 trn2 v25.4h, v14.4h, v18.4h 888 trn1 v26.4h, v15.4h, v19.4h 889 trn2 v27.4h, v15.4h, v19.4h 890 891 trn1 v14.2s, v24.2s, v26.2s 892 trn2 v15.2s, v24.2s, v26.2s 893 trn1 v18.2s, v25.2s, v27.2s 894 trn2 v19.2s, v25.2s, v27.2s 895 896 mov v24.d[0],x15 897 mov v25.d[0],x16 898 mov v26.d[0],x19 899 mov v27.d[0],x20 900 901 st1 { v30.4h, v31.4h},[x1],#16 902 st1 { v12.4h, v13.4h},[x1],#16 903 add x1,x1,#128 904 st1 { v14.4h, v15.4h},[x1],#16 905 st1 { v18.4h, v19.4h},[x1],#16 906 sub x1,x1,#160 907 mov x0,x8 908 909 910 911 ld1 {v10.4h},[x0],x6 912 ld1 {v8.4h},[x0],x6 913 ld1 {v11.4h},[x0],x6 914 ld1 {v9.4h},[x0],x6 915 916 917 smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 918 smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1) 919 smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 920 smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) 921 922 smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 923 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 924 smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2) 925 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 926 927 928 929 930 931 smull v20.4s, v10.4h, v0.4h[0] 932 smlsl v20.4s, v11.4h, v7.4h[2] 933 934 935 smull v22.4s, v10.4h, v0.4h[0] 936 smlsl v22.4s, v11.4h, v6.4h[2] 937 938 smull v16.4s, v10.4h, v0.4h[0] 939 smlsl v16.4s, v11.4h, v5.4h[2] 940 941 smull v18.4s, v10.4h, v0.4h[0] 942 smlsl v18.4s, v11.4h, v4.4h[2] 943 944 cmp x11,x10 945 bhs shift3 946 947 ld1 {v12.4h},[x0],x6 948 ld1 
{v14.4h},[x0],x6 949 ld1 {v13.4h},[x0],x6 950 ld1 {v15.4h},[x0],x6 951 952 953 954 955 smlsl v24.4s, v14.4h, v5.4h[1] 956 smlsl v26.4s, v14.4h, v7.4h[3] 957 smlal v28.4s, v14.4h, v5.4h[3] 958 smlal v30.4s, v14.4h, v3.4h[1] 959 960 961 smlal v24.4s, v15.4h, v2.4h[1] 962 smlal v26.4s, v15.4h, v1.4h[1] 963 smlal v28.4s, v15.4h, v4.4h[3] 964 smlsl v30.4s, v15.4h, v7.4h[3] 965 966 967 smlsl v20.4s, v12.4h, v1.4h[0] 968 smlal v20.4s, v13.4h, v6.4h[2] 969 smlsl v22.4s, v12.4h, v3.4h[0] 970 smlal v22.4s, v13.4h, v3.4h[2] 971 smlsl v16.4s, v12.4h, v5.4h[0] 972 smlal v16.4s, v13.4h, v0.4h[2] 973 smlsl v18.4s, v12.4h, v7.4h[0] 974 smlal v18.4s, v13.4h, v2.4h[2] 975 976 cmp x11,x9 977 bhs shift3 978 979 ld1 {v10.4h},[x0],x6 980 ld1 {v8.4h},[x0],x6 981 ld1 {v11.4h},[x0],x6 982 ld1 {v9.4h},[x0],x6 983 984 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 985 smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1) 986 smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2) 987 smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 988 989 smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 990 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 991 smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 992 smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 993 994 995 996 997 998 smlal v20.4s, v10.4h, v2.4h[0] 999 smlsl v20.4s, v11.4h, v5.4h[2] 1000 1001 1002 smlal v22.4s, v10.4h, v6.4h[0] 1003 smlsl v22.4s, v11.4h, v0.4h[2] 1004 1005 smlsl v16.4s, v10.4h, v6.4h[0] 1006 smlsl v16.4s, v11.4h, v4.4h[2] 1007 1008 smlsl v18.4s, v10.4h, v2.4h[0] 1009 smlal v18.4s, v11.4h, v6.4h[2] 1010 1011 cmp x11,x5 1012 bhs shift3 1013 1014 1015 ld1 {v12.4h},[x0],x6 1016 ld1 {v14.4h},[x0],x6 1017 ld1 {v13.4h},[x0],x6 1018 ld1 {v15.4h},[x0],x6 1019 1020 1021 1022 1023 1024 1025 smlsl v24.4s, v14.4h, v7.4h[1] 1026 smlal v26.4s, v14.4h, v2.4h[1] 1027 smlal v28.4s, v14.4h, v4.4h[1] 1028 smlsl v30.4s, v14.4h, v5.4h[1] 1029 1030 
1031 smlal v24.4s, v15.4h, v0.4h[3] 1032 smlal v26.4s, v15.4h, v7.4h[1] 1033 smlsl v28.4s, v15.4h, v1.4h[1] 1034 smlsl v30.4s, v15.4h, v6.4h[1] 1035 1036 1037 smlsl v20.4s, v12.4h, v3.4h[0] 1038 smlal v20.4s, v13.4h, v4.4h[2] 1039 smlal v22.4s, v12.4h, v7.4h[0] 1040 smlal v22.4s, v13.4h, v2.4h[2] 1041 smlal v16.4s, v12.4h, v1.4h[0] 1042 smlsl v16.4s, v13.4h, v6.4h[2] 1043 smlal v18.4s, v12.4h, v5.4h[0] 1044 smlsl v18.4s, v13.4h, v0.4h[2] 1045 1046 1047 cmp x11,x7 1048 bhs shift3 1049 1050 1051 ld1 {v10.4h},[x0],x6 1052 ld1 {v8.4h},[x0],x6 1053 ld1 {v11.4h},[x0],x6 1054 ld1 {v9.4h},[x0],x6 1055 1056 1057 smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 1058 smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1) 1059 smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2) 1060 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 1061 1062 smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 1063 smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1064 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1065 smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1066 1067 1068 1069 1070 1071 smlal v20.4s, v10.4h, v0.4h[0] 1072 smlsl v20.4s, v11.4h, v3.4h[2] 1073 1074 1075 smlsl v22.4s, v10.4h, v0.4h[0] 1076 smlsl v22.4s, v11.4h, v5.4h[2] 1077 1078 smlsl v16.4s, v10.4h, v0.4h[0] 1079 smlal v16.4s, v11.4h, v1.4h[2] 1080 1081 smlal v18.4s, v10.4h, v0.4h[0] 1082 smlal v18.4s, v11.4h, v7.4h[2] 1083 1084 1085 ld1 {v12.4h},[x0],x6 1086 ld1 {v14.4h},[x0],x6 1087 ld1 {v13.4h},[x0],x6 1088 ld1 {v15.4h},[x0],x6 1089 1090 1091 1092 smlal v24.4s, v14.4h, v6.4h[3] 1093 smlal v26.4s, v14.4h, v3.4h[3] 1094 smlsl v28.4s, v14.4h, v1.4h[3] 1095 smlal v30.4s, v14.4h, v7.4h[1] 1096 1097 1098 smlal v24.4s, v15.4h, v1.4h[3] 1099 smlsl v26.4s, v15.4h, v2.4h[3] 1100 smlal v28.4s, v15.4h, v7.4h[1] 1101 smlal v30.4s, v15.4h, v4.4h[1] 1102 1103 1104 smlsl v20.4s, v12.4h, v5.4h[0] 1105 smlal v20.4s, v13.4h, v2.4h[2] 
1106 smlal v22.4s, v12.4h, v1.4h[0] 1107 smlsl v22.4s, v13.4h, v7.4h[2] 1108 smlsl v16.4s, v12.4h, v7.4h[0] 1109 smlsl v16.4s, v13.4h, v3.4h[2] 1110 smlsl v18.4s, v12.4h, v3.4h[0] 1111 smlal v18.4s, v13.4h, v1.4h[2] 1112 1113 1114 1115 ld1 {v10.4h},[x0],x6 1116 ld1 {v8.4h},[x0],x6 1117 ld1 {v11.4h},[x0],x6 1118 ld1 {v9.4h},[x0],x6 1119 1120 1121 1122 1123 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) 1124 smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) 1125 smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) 1126 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) 1127 1128 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1129 smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1130 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1131 smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1132 1133 1134 1135 1136 1137 smlal v20.4s, v10.4h, v6.4h[0] 1138 smlsl v20.4s, v11.4h, v1.4h[2] 1139 1140 1141 smlsl v22.4s, v10.4h, v2.4h[0] 1142 smlal v22.4s, v11.4h, v4.4h[2] 1143 1144 smlal v16.4s, v10.4h, v2.4h[0] 1145 smlsl v16.4s, v11.4h, v7.4h[2] 1146 1147 smlsl v18.4s, v10.4h, v6.4h[0] 1148 smlsl v18.4s, v11.4h, v5.4h[2] 1149 1150 1151 ld1 {v12.4h},[x0],x6 1152 ld1 {v14.4h},[x0],x6 1153 ld1 {v13.4h},[x0],x6 1154 ld1 {v15.4h},[x0],x6 1155 1156 smlal v24.4s, v14.4h, v4.4h[3] 1157 smlsl v26.4s, v14.4h, v6.4h[1] 1158 smlal v28.4s, v14.4h, v7.4h[3] 1159 smlal v30.4s, v14.4h, v6.4h[3] 1160 1161 1162 smlal v24.4s, v15.4h, v3.4h[3] 1163 smlsl v26.4s, v15.4h, v3.4h[1] 1164 smlal v28.4s, v15.4h, v2.4h[3] 1165 smlsl v30.4s, v15.4h, v2.4h[1] 1166 1167 1168 smlsl v20.4s, v12.4h, v7.4h[0] 1169 smlal v20.4s, v13.4h, v0.4h[2] 1170 smlal v22.4s, v12.4h, v5.4h[0] 1171 smlsl v22.4s, v13.4h, v1.4h[2] 1172 smlsl v16.4s, v12.4h, v3.4h[0] 1173 smlal v16.4s, v13.4h, v2.4h[2] 1174 smlal v18.4s, v12.4h, v1.4h[0] 1175 smlsl v18.4s, v13.4h, v3.4h[2] 1176 1177 shift3: 1178 add v8.4s, v20.4s , v24.4s 
1179 sub v10.4s, v20.4s , v24.4s 1180 1181 add v12.4s, v22.4s , v26.4s 1182 sub v24.4s, v22.4s , v26.4s 1183 1184 add v14.4s, v16.4s , v28.4s 1185 sub v26.4s, v16.4s , v28.4s 1186 1187 1188 add v16.4s, v18.4s , v30.4s 1189 sub v28.4s, v18.4s , v30.4s 1190 1191 1192 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 1193 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 1194 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 1195 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 1196 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 1197 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 1198 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 1199 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 1200 1201 umov x15,v24.d[0] 1202 umov x16,v25.d[0] 1203 umov x19,v26.d[0] 1204 umov x20,v27.d[0] 1205 1206 trn1 v24.4h, v30.4h, v12.4h 1207 trn2 v25.4h, v30.4h, v12.4h 1208 trn1 v26.4h, v31.4h, v13.4h 1209 trn2 v27.4h, v31.4h, v13.4h 1210 1211 trn1 v30.2s, v24.2s, v26.2s 1212 trn2 v31.2s, v24.2s, v26.2s 1213 trn1 v12.2s, v25.2s, v27.2s 1214 trn2 v13.2s, v25.2s, v27.2s 1215 1216 trn1 v24.4h, v14.4h, v18.4h 1217 trn2 v25.4h, v14.4h, v18.4h 1218 trn1 v26.4h, v15.4h, v19.4h 1219 trn2 v27.4h, v15.4h, v19.4h 1220 1221 trn1 v14.2s, v24.2s, v26.2s 1222 trn2 v15.2s, v24.2s, v26.2s 1223 trn1 v18.2s, v25.2s, v27.2s 1224 trn2 v19.2s, v25.2s, v27.2s 1225 1226 mov v24.d[0],x15 1227 mov v25.d[0],x16 1228 mov v26.d[0],x19 1229 mov v27.d[0],x20 1230 st1 { v30.4h, v31.4h},[x1],#16 1231 st1 { v12.4h, v13.4h},[x1],#16 1232 add x1,x1,#64 1233 st1 { v14.4h, v15.4h},[x1],#16 1234 st1 { v18.4h, v19.4h},[x1],#16 1235 sub x1,x1,#96 1236 1237 mov x0,x8 1238 1239 1240 1241 ld1 
{v10.4h},[x0],x6 1242 ld1 {v8.4h},[x0],x6 1243 ld1 {v11.4h},[x0],x6 1244 ld1 {v9.4h},[x0],x6 1245 1246 1247 smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 1248 smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) 1249 smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) 1250 smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3) 1251 1252 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1253 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1254 smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1255 smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1256 1257 1258 1259 1260 1261 smull v20.4s, v10.4h, v0.4h[0] 1262 smlsl v20.4s, v11.4h, v3.4h[2] 1263 1264 1265 smull v22.4s, v10.4h, v0.4h[0] 1266 smlsl v22.4s, v11.4h, v2.4h[2] 1267 1268 smull v16.4s, v10.4h, v0.4h[0] 1269 smlsl v16.4s, v11.4h, v1.4h[2] 1270 1271 smull v18.4s, v10.4h, v0.4h[0] 1272 smlsl v18.4s, v11.4h, v0.4h[2] 1273 1274 cmp x11,x10 1275 bhs shift4 1276 1277 ld1 {v12.4h},[x0],x6 1278 ld1 {v14.4h},[x0],x6 1279 ld1 {v13.4h},[x0],x6 1280 ld1 {v15.4h},[x0],x6 1281 1282 1283 1284 1285 1286 1287 smlal v24.4s, v14.4h, v0.4h[1] 1288 smlal v26.4s, v14.4h, v1.4h[3] 1289 smlal v28.4s, v14.4h, v4.4h[1] 1290 smlal v30.4s, v14.4h, v6.4h[3] 1291 1292 1293 smlsl v24.4s, v15.4h, v4.4h[1] 1294 smlsl v26.4s, v15.4h, v0.4h[3] 1295 smlsl v28.4s, v15.4h, v2.4h[3] 1296 smlsl v30.4s, v15.4h, v6.4h[1] 1297 1298 1299 smlal v20.4s, v12.4h, v7.4h[0] 1300 smlal v20.4s, v13.4h, v5.4h[2] 1301 smlal v22.4s, v12.4h, v5.4h[0] 1302 smlsl v22.4s, v13.4h, v7.4h[2] 1303 smlal v16.4s, v12.4h, v3.4h[0] 1304 smlsl v16.4s, v13.4h, v4.4h[2] 1305 smlal v18.4s, v12.4h, v1.4h[0] 1306 smlsl v18.4s, v13.4h, v1.4h[2] 1307 1308 cmp x11,x9 1309 bhs shift4 1310 1311 ld1 {v10.4h},[x0],x6 1312 ld1 {v8.4h},[x0],x6 1313 ld1 {v11.4h},[x0],x6 1314 ld1 {v9.4h},[x0],x6 1315 1316 1317 1318 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 1319 smlal v26.4s, 
v8.4h, v3.4h[1] //// y1 * cos3(part of b1) 1320 smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 1321 smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) 1322 1323 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1324 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1325 smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1326 smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1327 1328 1329 1330 1331 1332 smlsl v20.4s, v10.4h, v2.4h[0] 1333 smlal v20.4s, v11.4h, v1.4h[2] 1334 1335 1336 smlsl v22.4s, v10.4h, v6.4h[0] 1337 smlal v22.4s, v11.4h, v3.4h[2] 1338 1339 smlal v16.4s, v10.4h, v6.4h[0] 1340 smlsl v16.4s, v11.4h, v7.4h[2] 1341 1342 smlal v18.4s, v10.4h, v2.4h[0] 1343 smlsl v18.4s, v11.4h, v2.4h[2] 1344 1345 cmp x11,x5 1346 bhs shift4 1347 1348 1349 ld1 {v12.4h},[x0],x6 1350 ld1 {v14.4h},[x0],x6 1351 ld1 {v13.4h},[x0],x6 1352 ld1 {v15.4h},[x0],x6 1353 1354 1355 1356 1357 1358 1359 smlsl v24.4s, v14.4h, v1.4h[1] 1360 smlsl v26.4s, v14.4h, v7.4h[3] 1361 smlal v28.4s, v14.4h, v1.4h[3] 1362 smlal v30.4s, v14.4h, v4.4h[3] 1363 1364 1365 smlal v24.4s, v15.4h, v2.4h[1] 1366 smlal v26.4s, v15.4h, v5.4h[1] 1367 smlsl v28.4s, v15.4h, v3.4h[1] 1368 smlsl v30.4s, v15.4h, v4.4h[1] 1369 1370 1371 smlsl v20.4s, v12.4h, v5.4h[0] 1372 smlsl v20.4s, v13.4h, v7.4h[2] 1373 smlsl v22.4s, v12.4h, v1.4h[0] 1374 smlal v22.4s, v13.4h, v1.4h[2] 1375 smlsl v16.4s, v12.4h, v7.4h[0] 1376 smlal v16.4s, v13.4h, v5.4h[2] 1377 smlal v18.4s, v12.4h, v3.4h[0] 1378 smlsl v18.4s, v13.4h, v3.4h[2] 1379 1380 cmp x11,x7 1381 bhs shift4 1382 1383 1384 ld1 {v10.4h},[x0],x6 1385 ld1 {v8.4h},[x0],x6 1386 ld1 {v11.4h},[x0],x6 1387 ld1 {v9.4h},[x0],x6 1388 1389 1390 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) 1391 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 1392 smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) 1393 smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 1394 1395 
smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1396 smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1397 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1398 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1399 1400 1401 1402 1403 1404 smlal v20.4s, v10.4h, v0.4h[0] 1405 smlsl v20.4s, v11.4h, v0.4h[2] 1406 1407 1408 smlsl v22.4s, v10.4h, v0.4h[0] 1409 smlal v22.4s, v11.4h, v6.4h[2] 1410 1411 smlsl v16.4s, v10.4h, v0.4h[0] 1412 smlal v16.4s, v11.4h, v2.4h[2] 1413 1414 smlal v18.4s, v10.4h, v0.4h[0] 1415 smlsl v18.4s, v11.4h, v4.4h[2] 1416 1417 1418 1419 1420 ld1 {v12.4h},[x0],x6 1421 ld1 {v14.4h},[x0],x6 1422 ld1 {v13.4h},[x0],x6 1423 ld1 {v15.4h},[x0],x6 1424 1425 1426 1427 1428 1429 1430 smlal v24.4s, v14.4h, v3.4h[1] 1431 smlsl v26.4s, v14.4h, v2.4h[1] 1432 smlal v28.4s, v14.4h, v7.4h[3] 1433 smlal v30.4s, v14.4h, v2.4h[3] 1434 1435 1436 smlsl v24.4s, v15.4h, v0.4h[3] 1437 smlal v26.4s, v15.4h, v4.4h[3] 1438 smlal v28.4s, v15.4h, v6.4h[3] 1439 smlsl v30.4s, v15.4h, v2.4h[1] 1440 1441 1442 smlal v20.4s, v12.4h, v3.4h[0] 1443 smlsl v20.4s, v13.4h, v6.4h[2] 1444 smlal v22.4s, v12.4h, v7.4h[0] 1445 smlsl v22.4s, v13.4h, v4.4h[2] 1446 smlsl v16.4s, v12.4h, v1.4h[0] 1447 smlal v16.4s, v13.4h, v0.4h[2] 1448 smlal v18.4s, v12.4h, v5.4h[0] 1449 smlsl v18.4s, v13.4h, v5.4h[2] 1450 1451 1452 ld1 {v10.4h},[x0],x6 1453 ld1 {v8.4h},[x0],x6 1454 ld1 {v11.4h},[x0],x6 1455 ld1 {v9.4h},[x0],x6 1456 1457 1458 1459 1460 1461 smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0) 1462 smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) 1463 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 1464 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 1465 1466 smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 1467 smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1468 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1469 smlsl 
v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1470 1471 1472 1473 1474 1475 smlsl v20.4s, v10.4h, v6.4h[0] 1476 smlal v20.4s, v11.4h, v2.4h[2] 1477 1478 1479 smlal v22.4s, v10.4h, v2.4h[0] 1480 smlsl v22.4s, v11.4h, v0.4h[2] 1481 1482 smlsl v16.4s, v10.4h, v2.4h[0] 1483 smlal v16.4s, v11.4h, v3.4h[2] 1484 1485 smlal v18.4s, v10.4h, v6.4h[0] 1486 smlsl v18.4s, v11.4h, v6.4h[2] 1487 1488 1489 ld1 {v12.4h},[x0],x6 1490 ld1 {v14.4h},[x0],x6 1491 ld1 {v13.4h},[x0],x6 1492 ld1 {v15.4h},[x0],x6 1493 1494 1495 1496 1497 smlsl v24.4s, v14.4h, v5.4h[1] 1498 smlal v26.4s, v14.4h, v3.4h[3] 1499 smlsl v28.4s, v14.4h, v2.4h[1] 1500 smlal v30.4s, v14.4h, v0.4h[3] 1501 1502 1503 smlal v24.4s, v15.4h, v1.4h[3] 1504 smlsl v26.4s, v15.4h, v1.4h[1] 1505 smlal v28.4s, v15.4h, v0.4h[3] 1506 smlsl v30.4s, v15.4h, v0.4h[1] 1507 1508 1509 smlsl v20.4s, v12.4h, v1.4h[0] 1510 smlal v20.4s, v13.4h, v4.4h[2] 1511 smlal v22.4s, v12.4h, v3.4h[0] 1512 smlsl v22.4s, v13.4h, v5.4h[2] 1513 smlsl v16.4s, v12.4h, v5.4h[0] 1514 smlal v16.4s, v13.4h, v6.4h[2] 1515 smlal v18.4s, v12.4h, v7.4h[0] 1516 smlsl v18.4s, v13.4h, v7.4h[2] 1517 1518 shift4: 1519 add v8.4s, v20.4s , v24.4s 1520 sub v10.4s, v20.4s , v24.4s 1521 1522 add v12.4s, v22.4s , v26.4s 1523 sub v24.4s, v22.4s , v26.4s 1524 1525 add v14.4s, v16.4s , v28.4s 1526 sub v26.4s, v16.4s , v28.4s 1527 1528 1529 add v16.4s, v18.4s , v30.4s 1530 sub v28.4s, v18.4s , v30.4s 1531 1532 1533 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 1534 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 1535 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 1536 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 1537 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 1538 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 
7(shift_stage1_idct) 1539 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 1540 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 1541 1542 umov x15,v24.d[0] 1543 umov x16,v25.d[0] 1544 umov x19,v26.d[0] 1545 umov x20,v27.d[0] 1546 1547 trn1 v24.4h, v30.4h, v12.4h 1548 trn2 v25.4h, v30.4h, v12.4h 1549 trn1 v26.4h, v31.4h, v13.4h 1550 trn2 v27.4h, v31.4h, v13.4h 1551 1552 trn1 v30.2s, v24.2s, v26.2s 1553 trn2 v31.2s, v24.2s, v26.2s 1554 trn1 v12.2s, v25.2s, v27.2s 1555 trn2 v13.2s, v25.2s, v27.2s 1556 1557 trn1 v24.4h, v14.4h, v18.4h 1558 trn2 v25.4h, v14.4h, v18.4h 1559 trn1 v26.4h, v15.4h, v19.4h 1560 trn2 v27.4h, v15.4h, v19.4h 1561 1562 trn1 v14.2s, v24.2s, v26.2s 1563 trn2 v15.2s, v24.2s, v26.2s 1564 trn1 v18.2s, v25.2s, v27.2s 1565 trn2 v19.2s, v25.2s, v27.2s 1566 1567 mov v24.d[0],x15 1568 mov v25.d[0],x16 1569 mov v26.d[0],x19 1570 mov v27.d[0],x20 1571 1572 st1 { v30.4h, v31.4h},[x1],#16 1573 st1 { v12.4h, v13.4h},[x1],#16 1574 st1 { v14.4h, v15.4h},[x1],#16 1575 st1 { v18.4h, v19.4h},[x1],#16 1576 1577 add x1,x1,#96 1578 1579 subs x14,x14,#1 1580 bne dct_stage1 1581 second_stage_dct: 1582 // mov x0,x1 1583 ldp x8, x7,[sp],#16 1584 ldp x0, x1,[sp],#16 1585 1586 // add x4,x2,x8, lsl #1 @ x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data 1587 // add x5,x8,x8, lsl #1 @ 1588 // sub x0,x0,#512 1589 mov x11,#0xfffffff0 1590 mov x5, #0xffffff00 1591 ldr w6, x5_addr 1592 ldr w9, x9_addr 1593 // sub x1,x1,#2048 1594 mov x4,x1 1595 mov x10,#240 1596 mov x14,#8 1597 b stage2 1598 1599 // registers free : 1600 1601 // arm registers used 1602 // x8 : prediction stride 1603 // x7 : destination stride 1604 // x1: temp buffer 1605 // x2 : pred buffer 1606 // x3 : destination buffer 1607 // x14 : loop counter 1608 //x0 : scratch buffer 1609 //x10 : used as stride 1610 // x4 : used to store the initial address 1611 //x12 : zero cols 1612 // x11 : 0xfffffff0 1613 // x5 :
0xffffff00 1614 dct_stage2: 1615 add x4,x4,#32 1616 mov x1,x4 1617 stage2: 1618 ld1 {v10.4h, v11.4h},[x1],#16 1619 ld1 {v8.4h, v9.4h},[x1],x10 1620 1621 smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) 1622 smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) 1623 smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 1624 smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 1625 1626 smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1627 smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1628 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1629 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1630 1631 1632 1633 smull v20.4s, v10.4h, v0.4h[0] 1634 smlal v20.4s, v11.4h, v0.4h[2] 1635 1636 1637 smull v22.4s, v10.4h, v0.4h[0] 1638 smlal v22.4s, v11.4h, v1.4h[2] 1639 1640 smull v16.4s, v10.4h, v0.4h[0] 1641 smlal v16.4s, v11.4h, v2.4h[2] 1642 1643 smull v18.4s, v10.4h, v0.4h[0] 1644 smlal v18.4s, v11.4h, v3.4h[2] 1645 cmp x12,x11 1646 bhs stage2_shift1 1647 1648 ld1 {v12.4h, v13.4h},[x1],#16 1649 ld1 {v14.4h, v15.4h},[x1],x10 1650 1651 1652 1653 1654 1655 1656 smlal v24.4s, v14.4h, v1.4h[1] 1657 smlal v26.4s, v14.4h, v3.4h[3] 1658 smlal v28.4s, v14.4h, v6.4h[1] 1659 smlsl v30.4s, v14.4h, v7.4h[1] 1660 1661 1662 smlal v24.4s, v15.4h, v1.4h[3] 1663 smlal v26.4s, v15.4h, v5.4h[1] 1664 smlsl v28.4s, v15.4h, v7.4h[1] 1665 smlsl v30.4s, v15.4h, v3.4h[3] 1666 1667 1668 smlal v20.4s, v12.4h, v1.4h[0] 1669 smlal v20.4s, v13.4h, v1.4h[2] 1670 smlal v22.4s, v12.4h, v3.4h[0] 1671 smlal v22.4s, v13.4h, v4.4h[2] 1672 smlal v16.4s, v12.4h, v5.4h[0] 1673 smlal v16.4s, v13.4h, v7.4h[2] 1674 smlal v18.4s, v12.4h, v7.4h[0] 1675 smlsl v18.4s, v13.4h, v5.4h[2] 1676 cmp x12,x5 1677 bhs stage2_shift1 1678 1679 ld1 {v10.4h, v11.4h},[x1],#16 1680 ld1 {v8.4h, v9.4h},[x1],x10 1681 1682 smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) 1683 smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of 
b1) 1684 smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) 1685 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) 1686 1687 smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1688 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1689 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1690 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1691 1692 1693 1694 1695 1696 smlal v20.4s, v10.4h, v2.4h[0] 1697 smlal v20.4s, v11.4h, v2.4h[2] 1698 1699 1700 smlal v22.4s, v10.4h, v6.4h[0] 1701 smlal v22.4s, v11.4h, v7.4h[2] 1702 1703 smlsl v16.4s, v10.4h, v6.4h[0] 1704 smlsl v16.4s, v11.4h, v3.4h[2] 1705 1706 smlsl v18.4s, v10.4h, v2.4h[0] 1707 smlsl v18.4s, v11.4h, v1.4h[2] 1708 1709 cmp x12,x6 1710 bhs stage2_shift1 1711 1712 1713 ld1 {v12.4h, v13.4h},[x1],#16 1714 ld1 {v14.4h, v15.4h},[x1],x10 1715 1716 1717 1718 1719 1720 smlal v24.4s, v14.4h, v3.4h[1] 1721 smlsl v26.4s, v14.4h, v6.4h[1] 1722 smlsl v28.4s, v14.4h, v0.4h[1] 1723 smlsl v30.4s, v14.4h, v6.4h[3] 1724 1725 1726 smlal v24.4s, v15.4h, v3.4h[3] 1727 smlsl v26.4s, v15.4h, v4.4h[3] 1728 smlsl v28.4s, v15.4h, v2.4h[3] 1729 smlal v30.4s, v15.4h, v5.4h[3] 1730 1731 1732 smlal v20.4s, v12.4h, v3.4h[0] 1733 smlal v20.4s, v13.4h, v3.4h[2] 1734 smlsl v22.4s, v12.4h, v7.4h[0] 1735 smlsl v22.4s, v13.4h, v5.4h[2] 1736 smlsl v16.4s, v12.4h, v1.4h[0] 1737 smlsl v16.4s, v13.4h, v1.4h[2] 1738 smlsl v18.4s, v12.4h, v5.4h[0] 1739 smlal v18.4s, v13.4h, v7.4h[2] 1740 1741 cmp x12,x9 1742 bhs stage2_shift1 1743 1744 1745 ld1 {v10.4h, v11.4h},[x1],#16 1746 ld1 {v8.4h, v9.4h},[x1],x10 1747 1748 1749 smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 1750 smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1) 1751 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 1752 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) 1753 1754 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1755 smlsl v26.4s, v9.4h, 
v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1756 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1757 smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1758 1759 1760 1761 1762 1763 smlal v20.4s, v10.4h, v0.4h[0] 1764 smlal v20.4s, v11.4h, v4.4h[2] 1765 1766 1767 smlsl v22.4s, v10.4h, v0.4h[0] 1768 smlsl v22.4s, v11.4h, v2.4h[2] 1769 1770 smlsl v16.4s, v10.4h, v0.4h[0] 1771 smlsl v16.4s, v11.4h, v6.4h[2] 1772 1773 smlal v18.4s, v10.4h, v0.4h[0] 1774 smlal v18.4s, v11.4h, v0.4h[2] 1775 1776 ld1 {v12.4h, v13.4h},[x1],#16 1777 ld1 {v14.4h, v15.4h},[x1],x10 1778 1779 1780 1781 1782 1783 smlal v24.4s, v14.4h, v5.4h[1] 1784 smlsl v26.4s, v14.4h, v0.4h[2] 1785 smlal v28.4s, v14.4h, v5.4h[3] 1786 smlal v30.4s, v14.4h, v4.4h[3] 1787 1788 1789 smlal v24.4s, v15.4h, v5.4h[3] 1790 smlsl v26.4s, v15.4h, v1.4h[1] 1791 smlal v28.4s, v15.4h, v3.4h[1] 1792 smlsl v30.4s, v15.4h, v7.4h[3] 1793 1794 1795 smlal v20.4s, v12.4h, v5.4h[0] 1796 smlal v20.4s, v13.4h, v5.4h[2] 1797 smlsl v22.4s, v12.4h, v1.4h[0] 1798 smlsl v22.4s, v13.4h, v0.4h[2] 1799 smlal v16.4s, v12.4h, v7.4h[0] 1800 smlal v16.4s, v13.4h, v4.4h[2] 1801 smlal v18.4s, v12.4h, v3.4h[0] 1802 smlal v18.4s, v13.4h, v6.4h[2] 1803 1804 1805 ld1 {v10.4h, v11.4h},[x1],#16 1806 ld1 {v8.4h, v9.4h},[x1],x10 1807 1808 1809 1810 1811 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 1812 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 1813 smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2) 1814 smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3) 1815 1816 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1817 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1818 smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1819 smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1820 1821 1822 1823 1824 1825 smlal v20.4s, v10.4h, v6.4h[0] 1826 smlal v20.4s, v11.4h, v6.4h[2] 1827 1828 1829 smlsl v22.4s, 
v10.4h, v2.4h[0] 1830 smlsl v22.4s, v11.4h, v3.4h[2] 1831 1832 smlal v16.4s, v10.4h, v2.4h[0] 1833 smlal v16.4s, v11.4h, v0.4h[2] 1834 1835 smlsl v18.4s, v10.4h, v6.4h[0] 1836 smlsl v18.4s, v11.4h, v2.4h[2] 1837 1838 ld1 {v12.4h, v13.4h},[x1],#16 1839 ld1 {v14.4h, v15.4h},[x1],x10 1840 1841 smlal v24.4s, v14.4h, v7.4h[1] 1842 smlsl v26.4s, v14.4h, v5.4h[3] 1843 smlal v28.4s, v14.4h, v4.4h[1] 1844 smlsl v30.4s, v14.4h, v2.4h[3] 1845 1846 1847 smlal v24.4s, v15.4h, v7.4h[3] 1848 smlsl v26.4s, v15.4h, v7.4h[1] 1849 smlal v28.4s, v15.4h, v6.4h[3] 1850 smlsl v30.4s, v15.4h, v6.4h[1] 1851 1852 1853 smlal v20.4s, v12.4h, v7.4h[0] 1854 smlal v20.4s, v13.4h, v7.4h[2] 1855 smlsl v22.4s, v12.4h, v5.4h[0] 1856 smlsl v22.4s, v13.4h, v6.4h[2] 1857 smlal v16.4s, v12.4h, v3.4h[0] 1858 smlal v16.4s, v13.4h, v5.4h[2] 1859 smlsl v18.4s, v12.4h, v1.4h[0] 1860 smlsl v18.4s, v13.4h, v4.4h[2] 1861 1862 stage2_shift1: 1863 add v8.4s, v20.4s , v24.4s 1864 sub v10.4s, v20.4s , v24.4s 1865 1866 add v12.4s, v22.4s , v26.4s 1867 sub v24.4s, v22.4s , v26.4s 1868 1869 add v14.4s, v16.4s , v28.4s 1870 sub v26.4s, v16.4s , v28.4s 1871 1872 1873 add v16.4s, v18.4s , v30.4s 1874 sub v28.4s, v18.4s , v30.4s 1875 1876 1877 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 1878 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 1879 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 1880 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 1881 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 1882 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 1883 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 1884 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 1885 1886 1887 umov 
x15,v24.d[0] 1888 umov x16,v25.d[0] 1889 umov x19,v26.d[0] 1890 umov x20,v27.d[0] 1891 1892 trn1 v24.4h, v30.4h, v12.4h 1893 trn2 v25.4h, v30.4h, v12.4h 1894 trn1 v26.4h, v31.4h, v13.4h 1895 trn2 v27.4h, v31.4h, v13.4h 1896 1897 trn1 v30.2s, v24.2s, v26.2s 1898 trn2 v31.2s, v24.2s, v26.2s 1899 trn1 v12.2s, v25.2s, v27.2s 1900 trn2 v13.2s, v25.2s, v27.2s 1901 1902 trn1 v24.4h, v14.4h, v18.4h 1903 trn2 v25.4h, v14.4h, v18.4h 1904 trn1 v26.4h, v15.4h, v19.4h 1905 trn2 v27.4h, v15.4h, v19.4h 1906 1907 trn1 v14.2s, v24.2s, v26.2s 1908 trn2 v15.2s, v24.2s, v26.2s 1909 trn1 v18.2s, v25.2s, v27.2s 1910 trn2 v19.2s, v25.2s, v27.2s 1911 1912 mov v24.d[0],x15 1913 mov v25.d[0],x16 1914 mov v26.d[0],x19 1915 mov v27.d[0],x20 1916 1917 st1 { v30.4h, v31.4h},[x0],#16 1918 st1 { v12.4h, v13.4h},[x0],#16 1919 st1 { v14.4h, v15.4h},[x0],#16 1920 st1 { v18.4h, v19.4h},[x0],#16 1921 1922 mov x1,x4 1923 1924 1925 1926 1927 1928 1929 ld1 {v10.4h, v11.4h},[x1],#16 1930 ld1 {v8.4h, v9.4h},[x1],x10 1931 1932 1933 smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0) 1934 smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 1935 smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) 1936 smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 1937 1938 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1939 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1940 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1941 smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1942 1943 1944 1945 1946 1947 smull v20.4s, v10.4h, v0.4h[0] 1948 smlal v20.4s, v11.4h, v4.4h[2] 1949 1950 1951 smull v22.4s, v10.4h, v0.4h[0] 1952 smlal v22.4s, v11.4h, v5.4h[2] 1953 1954 smull v16.4s, v10.4h, v0.4h[0] 1955 smlal v16.4s, v11.4h, v6.4h[2] 1956 1957 smull v18.4s, v10.4h, v0.4h[0] 1958 smlal v18.4s, v11.4h, v7.4h[2] 1959 1960 cmp x12,x11 1961 bhs stage2_shift2 1962 1963 ld1 {v12.4h, v13.4h},[x1],#16 1964 ld1 {v14.4h, 
v15.4h},[x1],x10 1965 1966 1967 smlsl v24.4s, v14.4h, v4.4h[3] 1968 smlsl v26.4s, v14.4h, v2.4h[1] 1969 smlsl v28.4s, v14.4h, v0.4h[1] 1970 smlsl v30.4s, v14.4h, v2.4h[3] 1971 1972 1973 smlsl v24.4s, v15.4h, v0.4h[3] 1974 smlsl v26.4s, v15.4h, v3.4h[1] 1975 smlsl v28.4s, v15.4h, v6.4h[3] 1976 smlal v30.4s, v15.4h, v5.4h[3] 1977 1978 1979 smlsl v20.4s, v12.4h, v7.4h[0] 1980 smlsl v20.4s, v13.4h, v2.4h[2] 1981 smlsl v22.4s, v12.4h, v5.4h[0] 1982 smlsl v22.4s, v13.4h, v0.4h[2] 1983 smlsl v16.4s, v12.4h, v3.4h[0] 1984 smlsl v16.4s, v13.4h, v3.4h[2] 1985 smlsl v18.4s, v12.4h, v1.4h[0] 1986 smlsl v18.4s, v13.4h, v6.4h[2] 1987 1988 cmp x12,x5 1989 bhs stage2_shift2 1990 1991 ld1 {v10.4h, v11.4h},[x1],#16 1992 ld1 {v8.4h, v9.4h},[x1],x10 1993 1994 1995 1996 1997 1998 smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 1999 smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) 2000 smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2) 2001 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 2002 2003 smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2004 smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2005 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2006 smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2007 2008 2009 2010 2011 2012 smlsl v20.4s, v10.4h, v2.4h[0] 2013 smlsl v20.4s, v11.4h, v6.4h[2] 2014 2015 2016 smlsl v22.4s, v10.4h, v6.4h[0] 2017 smlal v22.4s, v11.4h, v4.4h[2] 2018 2019 smlal v16.4s, v10.4h, v6.4h[0] 2020 smlal v16.4s, v11.4h, v0.4h[2] 2021 2022 smlal v18.4s, v10.4h, v2.4h[0] 2023 smlal v18.4s, v11.4h, v5.4h[2] 2024 2025 cmp x12,x6 2026 bhs stage2_shift2 2027 2028 2029 ld1 {v12.4h, v13.4h},[x1],#16 2030 ld1 {v14.4h, v15.4h},[x1],x10 2031 2032 2033 2034 2035 2036 2037 smlal v24.4s, v14.4h, v2.4h[3] 2038 smlal v26.4s, v14.4h, v3.4h[3] 2039 smlsl v28.4s, v14.4h, v5.4h[3] 2040 smlsl v30.4s, v14.4h, v0.4h[3] 2041 2042 2043 smlal v24.4s, v15.4h, v1.4h[3] 2044 
smlsl v26.4s, v15.4h, v6.4h[3] 2045 smlsl v28.4s, v15.4h, v0.4h[3] 2046 smlal v30.4s, v15.4h, v7.4h[3] 2047 2048 2049 smlal v20.4s, v12.4h, v5.4h[0] 2050 smlal v20.4s, v13.4h, v0.4h[2] 2051 smlal v22.4s, v12.4h, v1.4h[0] 2052 smlal v22.4s, v13.4h, v6.4h[2] 2053 smlal v16.4s, v12.4h, v7.4h[0] 2054 smlsl v16.4s, v13.4h, v2.4h[2] 2055 smlsl v18.4s, v12.4h, v3.4h[0] 2056 smlsl v18.4s, v13.4h, v4.4h[2] 2057 2058 cmp x12,x9 2059 bhs stage2_shift2 2060 2061 2062 ld1 {v10.4h, v11.4h},[x1],#16 2063 ld1 {v8.4h, v9.4h},[x1],x10 2064 2065 2066 2067 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 2068 smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1) 2069 smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) 2070 smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3) 2071 2072 smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2073 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2074 smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2075 smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2076 2077 2078 2079 2080 2081 smlal v20.4s, v10.4h, v0.4h[0] 2082 smlsl v20.4s, v11.4h, v7.4h[2] 2083 2084 2085 smlsl v22.4s, v10.4h, v0.4h[0] 2086 smlsl v22.4s, v11.4h, v1.4h[2] 2087 2088 smlsl v16.4s, v10.4h, v0.4h[0] 2089 smlal v16.4s, v11.4h, v5.4h[2] 2090 2091 smlal v18.4s, v10.4h, v0.4h[0] 2092 smlal v18.4s, v11.4h, v3.4h[2] 2093 2094 ld1 {v12.4h, v13.4h},[x1],#16 2095 ld1 {v14.4h, v15.4h},[x1],x10 2096 2097 2098 2099 2100 smlsl v24.4s, v14.4h, v0.4h[1] 2101 smlal v26.4s, v14.4h, v6.4h[1] 2102 smlal v28.4s, v14.4h, v4.4h[1] 2103 smlsl v30.4s, v14.4h, v1.4h[1] 2104 2105 2106 smlsl v24.4s, v15.4h, v3.4h[3] 2107 smlal v26.4s, v15.4h, v0.4h[1] 2108 smlsl v28.4s, v15.4h, v5.4h[1] 2109 smlsl v30.4s, v15.4h, v6.4h[1] 2110 2111 2112 smlsl v20.4s, v12.4h, v3.4h[0] 2113 smlsl v20.4s, v13.4h, v1.4h[2] 2114 smlsl v22.4s, v12.4h, v7.4h[0] 2115 smlal v22.4s, v13.4h, v3.4h[2] 2116 smlal v16.4s, v12.4h, 
v1.4h[0] 2117 smlal v16.4s, v13.4h, v7.4h[2] 2118 smlsl v18.4s, v12.4h, v5.4h[0] 2119 smlsl v18.4s, v13.4h, v2.4h[2] 2120 2121 2122 ld1 {v10.4h, v11.4h},[x1],#16 2123 ld1 {v8.4h, v9.4h},[x1],x10 2124 2125 2126 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 2127 smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1) 2128 smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 2129 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3) 2130 2131 smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2132 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2133 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2134 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2135 2136 2137 2138 2139 2140 smlsl v20.4s, v10.4h, v6.4h[0] 2141 smlal v20.4s, v11.4h, v5.4h[2] 2142 2143 2144 smlal v22.4s, v10.4h, v2.4h[0] 2145 smlal v22.4s, v11.4h, v7.4h[2] 2146 2147 smlsl v16.4s, v10.4h, v2.4h[0] 2148 smlsl v16.4s, v11.4h, v4.4h[2] 2149 2150 smlal v18.4s, v10.4h, v6.4h[0] 2151 smlal v18.4s, v11.4h, v1.4h[2] 2152 2153 2154 ld1 {v12.4h, v13.4h},[x1],#16 2155 ld1 {v14.4h, v15.4h},[x1],x10 2156 2157 2158 2159 smlal v24.4s, v14.4h, v1.4h[1] 2160 smlsl v26.4s, v14.4h, v0.4h[3] 2161 smlal v28.4s, v14.4h, v1.4h[3] 2162 smlsl v30.4s, v14.4h, v3.4h[1] 2163 2164 2165 smlal v24.4s, v15.4h, v5.4h[3] 2166 smlsl v26.4s, v15.4h, v5.4h[1] 2167 smlal v28.4s, v15.4h, v4.4h[3] 2168 smlsl v30.4s, v15.4h, v4.4h[1] 2169 2170 2171 smlal v20.4s, v12.4h, v1.4h[0] 2172 smlal v20.4s, v13.4h, v3.4h[2] 2173 smlsl v22.4s, v12.4h, v3.4h[0] 2174 smlsl v22.4s, v13.4h, v2.4h[2] 2175 smlal v16.4s, v12.4h, v5.4h[0] 2176 smlal v16.4s, v13.4h, v1.4h[2] 2177 smlsl v18.4s, v12.4h, v7.4h[0] 2178 smlsl v18.4s, v13.4h, v0.4h[2] 2179 2180 stage2_shift2: 2181 add v8.4s, v20.4s , v24.4s 2182 sub v10.4s, v20.4s , v24.4s 2183 2184 add v12.4s, v22.4s , v26.4s 2185 sub v24.4s, v22.4s , v26.4s 2186 2187 add v14.4s, v16.4s , v28.4s 2188 sub v26.4s, 
v16.4s , v28.4s 2189 2190 2191 add v16.4s, v18.4s , v30.4s 2192 sub v28.4s, v18.4s , v30.4s 2193 2194 2195 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2196 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 2197 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2198 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2199 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2200 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2201 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2202 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 2203 2204 umov x15,v24.d[0] 2205 umov x16,v25.d[0] 2206 umov x19,v26.d[0] 2207 umov x20,v27.d[0] 2208 2209 trn1 v24.4h, v30.4h, v12.4h 2210 trn2 v25.4h, v30.4h, v12.4h 2211 trn1 v26.4h, v31.4h, v13.4h 2212 trn2 v27.4h, v31.4h, v13.4h 2213 2214 trn1 v30.2s, v24.2s, v26.2s 2215 trn2 v31.2s, v24.2s, v26.2s 2216 trn1 v12.2s, v25.2s, v27.2s 2217 trn2 v13.2s, v25.2s, v27.2s 2218 2219 trn1 v24.4h, v14.4h, v18.4h 2220 trn2 v25.4h, v14.4h, v18.4h 2221 trn1 v26.4h, v15.4h, v19.4h 2222 trn2 v27.4h, v15.4h, v19.4h 2223 2224 trn1 v14.2s, v24.2s, v26.2s 2225 trn2 v15.2s, v24.2s, v26.2s 2226 trn1 v18.2s, v25.2s, v27.2s 2227 trn2 v19.2s, v25.2s, v27.2s 2228 2229 mov v24.d[0],x15 2230 mov v25.d[0],x16 2231 mov v26.d[0],x19 2232 mov v27.d[0],x20 2233 2234 st1 { v30.4h, v31.4h},[x0],#16 2235 st1 { v12.4h, v13.4h},[x0],#16 2236 st1 { v14.4h, v15.4h},[x0],#16 2237 st1 { v18.4h, v19.4h},[x0],#16 2238 2239 2240 mov x1,x4 2241 2242 2243 2244 2245 ld1 {v10.4h, v11.4h},[x1],#16 2246 ld1 {v8.4h, v9.4h},[x1],x10 2247 2248 smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0) 2249 smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part 
of b1) 2250 smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 2251 smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) 2252 2253 smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2254 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2255 smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2) 2256 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2257 2258 2259 2260 2261 2262 smull v20.4s, v10.4h, v0.4h[0] 2263 smlsl v20.4s, v11.4h, v7.4h[2] 2264 2265 2266 smull v22.4s, v10.4h, v0.4h[0] 2267 smlsl v22.4s, v11.4h, v6.4h[2] 2268 2269 smull v16.4s, v10.4h, v0.4h[0] 2270 smlsl v16.4s, v11.4h, v5.4h[2] 2271 2272 smull v18.4s, v10.4h, v0.4h[0] 2273 smlsl v18.4s, v11.4h, v4.4h[2] 2274 2275 cmp x12,x11 2276 bhs stage2_shift3 2277 2278 ld1 {v12.4h, v13.4h},[x1],#16 2279 ld1 {v14.4h, v15.4h},[x1],x10 2280 2281 smlsl v24.4s, v14.4h, v5.4h[1] 2282 smlsl v26.4s, v14.4h, v7.4h[3] 2283 smlal v28.4s, v14.4h, v5.4h[3] 2284 smlal v30.4s, v14.4h, v3.4h[1] 2285 2286 2287 smlal v24.4s, v15.4h, v2.4h[1] 2288 smlal v26.4s, v15.4h, v1.4h[1] 2289 smlal v28.4s, v15.4h, v4.4h[3] 2290 smlsl v30.4s, v15.4h, v7.4h[3] 2291 2292 2293 smlsl v20.4s, v12.4h, v1.4h[0] 2294 smlal v20.4s, v13.4h, v6.4h[2] 2295 smlsl v22.4s, v12.4h, v3.4h[0] 2296 smlal v22.4s, v13.4h, v3.4h[2] 2297 smlsl v16.4s, v12.4h, v5.4h[0] 2298 smlal v16.4s, v13.4h, v0.4h[2] 2299 smlsl v18.4s, v12.4h, v7.4h[0] 2300 smlal v18.4s, v13.4h, v2.4h[2] 2301 2302 cmp x12,x5 2303 bhs stage2_shift3 2304 2305 ld1 {v10.4h, v11.4h},[x1],#16 2306 ld1 {v8.4h, v9.4h},[x1],x10 2307 2308 2309 2310 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 2311 smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1) 2312 smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2) 2313 smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 2314 2315 smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2316 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 
* sin1(part of b1) 2317 smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2318 smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2319 2320 2321 2322 2323 2324 smlal v20.4s, v10.4h, v2.4h[0] 2325 smlsl v20.4s, v11.4h, v5.4h[2] 2326 2327 2328 smlal v22.4s, v10.4h, v6.4h[0] 2329 smlsl v22.4s, v11.4h, v0.4h[2] 2330 2331 smlsl v16.4s, v10.4h, v6.4h[0] 2332 smlsl v16.4s, v11.4h, v4.4h[2] 2333 2334 smlsl v18.4s, v10.4h, v2.4h[0] 2335 smlal v18.4s, v11.4h, v6.4h[2] 2336 2337 cmp x12,x6 2338 bhs stage2_shift3 2339 2340 ld1 {v12.4h, v13.4h},[x1],#16 2341 ld1 {v14.4h, v15.4h},[x1],x10 2342 2343 2344 2345 2346 2347 smlsl v24.4s, v14.4h, v7.4h[1] 2348 smlal v26.4s, v14.4h, v2.4h[1] 2349 smlal v28.4s, v14.4h, v4.4h[1] 2350 smlsl v30.4s, v14.4h, v5.4h[1] 2351 2352 2353 smlal v24.4s, v15.4h, v0.4h[3] 2354 smlal v26.4s, v15.4h, v7.4h[1] 2355 smlsl v28.4s, v15.4h, v1.4h[1] 2356 smlsl v30.4s, v15.4h, v6.4h[1] 2357 2358 2359 smlsl v20.4s, v12.4h, v3.4h[0] 2360 smlal v20.4s, v13.4h, v4.4h[2] 2361 smlal v22.4s, v12.4h, v7.4h[0] 2362 smlal v22.4s, v13.4h, v2.4h[2] 2363 smlal v16.4s, v12.4h, v1.4h[0] 2364 smlsl v16.4s, v13.4h, v6.4h[2] 2365 smlal v18.4s, v12.4h, v5.4h[0] 2366 smlsl v18.4s, v13.4h, v0.4h[2] 2367 2368 cmp x12,x9 2369 bhs stage2_shift3 2370 2371 2372 ld1 {v10.4h, v11.4h},[x1],#16 2373 ld1 {v8.4h, v9.4h},[x1],x10 2374 2375 2376 smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 2377 smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1) 2378 smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2) 2379 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 2380 2381 smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2382 smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2383 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2384 smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2385 2386 2387 2388 2389 2390 smlal v20.4s, v10.4h, v0.4h[0] 2391 smlsl 
v20.4s, v11.4h, v3.4h[2] 2392 2393 2394 smlsl v22.4s, v10.4h, v0.4h[0] 2395 smlsl v22.4s, v11.4h, v5.4h[2] 2396 2397 smlsl v16.4s, v10.4h, v0.4h[0] 2398 smlal v16.4s, v11.4h, v1.4h[2] 2399 2400 smlal v18.4s, v10.4h, v0.4h[0] 2401 smlal v18.4s, v11.4h, v7.4h[2] 2402 2403 ld1 {v12.4h, v13.4h},[x1],#16 2404 ld1 {v14.4h, v15.4h},[x1],x10 2405 2406 2407 2408 2409 smlal v24.4s, v14.4h, v6.4h[3] 2410 smlal v26.4s, v14.4h, v3.4h[3] 2411 smlsl v28.4s, v14.4h, v1.4h[3] 2412 smlal v30.4s, v14.4h, v7.4h[1] 2413 2414 2415 smlal v24.4s, v15.4h, v1.4h[3] 2416 smlsl v26.4s, v15.4h, v2.4h[3] 2417 smlal v28.4s, v15.4h, v7.4h[1] 2418 smlal v30.4s, v15.4h, v4.4h[1] 2419 2420 2421 smlsl v20.4s, v12.4h, v5.4h[0] 2422 smlal v20.4s, v13.4h, v2.4h[2] 2423 smlal v22.4s, v12.4h, v1.4h[0] 2424 smlsl v22.4s, v13.4h, v7.4h[2] 2425 smlsl v16.4s, v12.4h, v7.4h[0] 2426 smlsl v16.4s, v13.4h, v3.4h[2] 2427 smlsl v18.4s, v12.4h, v3.4h[0] 2428 smlal v18.4s, v13.4h, v1.4h[2] 2429 2430 2431 ld1 {v10.4h, v11.4h},[x1],#16 2432 ld1 {v8.4h, v9.4h},[x1],x10 2433 2434 2435 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) 2436 smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) 2437 smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2) 2438 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3) 2439 2440 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2441 smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2442 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2443 smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2444 2445 2446 2447 2448 2449 smlal v20.4s, v10.4h, v6.4h[0] 2450 smlsl v20.4s, v11.4h, v1.4h[2] 2451 2452 2453 smlsl v22.4s, v10.4h, v2.4h[0] 2454 smlal v22.4s, v11.4h, v4.4h[2] 2455 2456 smlal v16.4s, v10.4h, v2.4h[0] 2457 smlsl v16.4s, v11.4h, v7.4h[2] 2458 2459 smlsl v18.4s, v10.4h, v6.4h[0] 2460 smlsl v18.4s, v11.4h, v5.4h[2] 2461 2462 ld1 {v12.4h, v13.4h},[x1],#16 2463 ld1 {v14.4h, v15.4h},[x1],x10 
2464 2465 2466 2467 smlal v24.4s, v14.4h, v4.4h[3] 2468 smlsl v26.4s, v14.4h, v6.4h[1] 2469 smlal v28.4s, v14.4h, v7.4h[3] 2470 smlal v30.4s, v14.4h, v6.4h[3] 2471 2472 2473 smlal v24.4s, v15.4h, v3.4h[3] 2474 smlsl v26.4s, v15.4h, v3.4h[1] 2475 smlal v28.4s, v15.4h, v2.4h[3] 2476 smlsl v30.4s, v15.4h, v2.4h[1] 2477 2478 2479 smlsl v20.4s, v12.4h, v7.4h[0] 2480 smlal v20.4s, v13.4h, v0.4h[2] 2481 smlal v22.4s, v12.4h, v5.4h[0] 2482 smlsl v22.4s, v13.4h, v1.4h[2] 2483 smlsl v16.4s, v12.4h, v3.4h[0] 2484 smlal v16.4s, v13.4h, v2.4h[2] 2485 smlal v18.4s, v12.4h, v1.4h[0] 2486 smlsl v18.4s, v13.4h, v3.4h[2] 2487 2488 stage2_shift3: 2489 add v8.4s, v20.4s , v24.4s 2490 sub v10.4s, v20.4s , v24.4s 2491 2492 add v12.4s, v22.4s , v26.4s 2493 sub v24.4s, v22.4s , v26.4s 2494 2495 add v14.4s, v16.4s , v28.4s 2496 sub v26.4s, v16.4s , v28.4s 2497 2498 2499 add v16.4s, v18.4s , v30.4s 2500 sub v28.4s, v18.4s , v30.4s 2501 2502 2503 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2504 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 2505 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2506 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2507 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2508 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2509 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2510 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 2511 2512 umov x15,v24.d[0] 2513 umov x16,v25.d[0] 2514 umov x19,v26.d[0] 2515 umov x20,v27.d[0] 2516 2517 trn1 v24.4h, v30.4h, v12.4h 2518 trn2 v25.4h, v30.4h, v12.4h 2519 trn1 v26.4h, v31.4h, v13.4h 2520 trn2 v27.4h, v31.4h, v13.4h 2521 2522 trn1 v30.2s, v24.2s, v26.2s 2523 trn2 v31.2s,
v24.2s, v26.2s 2524 trn1 v12.2s, v25.2s, v27.2s 2525 trn2 v13.2s, v25.2s, v27.2s 2526 2527 trn1 v24.4h, v14.4h, v18.4h 2528 trn2 v25.4h, v14.4h, v18.4h 2529 trn1 v26.4h, v15.4h, v19.4h 2530 trn2 v27.4h, v15.4h, v19.4h 2531 2532 trn1 v14.2s, v24.2s, v26.2s 2533 trn2 v15.2s, v24.2s, v26.2s 2534 trn1 v18.2s, v25.2s, v27.2s 2535 trn2 v19.2s, v25.2s, v27.2s 2536 2537 mov v24.d[0],x15 2538 mov v25.d[0],x16 2539 mov v26.d[0],x19 2540 mov v27.d[0],x20 2541 2542 st1 { v30.4h, v31.4h},[x0],#16 2543 st1 { v12.4h, v13.4h},[x0],#16 2544 st1 { v14.4h, v15.4h},[x0],#16 2545 st1 { v18.4h, v19.4h},[x0],#16 2546 2547 2548 2549 mov x1,x4 2550 2551 2552 2553 2554 ld1 {v10.4h, v11.4h},[x1],#16 2555 ld1 {v8.4h, v9.4h},[x1],x10 2556 2557 2558 smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0) 2559 smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1) 2560 smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2) 2561 smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3) 2562 2563 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2564 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2565 smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2566 smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2567 2568 2569 2570 2571 2572 smull v20.4s, v10.4h, v0.4h[0] 2573 smlsl v20.4s, v11.4h, v3.4h[2] 2574 2575 2576 smull v22.4s, v10.4h, v0.4h[0] 2577 smlsl v22.4s, v11.4h, v2.4h[2] 2578 2579 smull v16.4s, v10.4h, v0.4h[0] 2580 smlsl v16.4s, v11.4h, v1.4h[2] 2581 2582 smull v18.4s, v10.4h, v0.4h[0] 2583 smlsl v18.4s, v11.4h, v0.4h[2] 2584 2585 cmp x12,x11 2586 bhs stage2_shift4 2587 ld1 {v12.4h, v13.4h},[x1],#16 2588 ld1 {v14.4h, v15.4h},[x1],x10 2589 2590 2591 2592 2593 2594 2595 smlal v24.4s, v14.4h, v0.4h[1] 2596 smlal v26.4s, v14.4h, v1.4h[3] 2597 smlal v28.4s, v14.4h, v4.4h[1] 2598 smlal v30.4s, v14.4h, v6.4h[3] 2599 2600 2601 smlsl v24.4s, v15.4h, v4.4h[1] 2602 smlsl v26.4s, v15.4h, v0.4h[3] 2603 smlsl v28.4s, 
v15.4h, v2.4h[3] 2604 smlsl v30.4s, v15.4h, v6.4h[1] 2605 2606 2607 smlal v20.4s, v12.4h, v7.4h[0] 2608 smlal v20.4s, v13.4h, v5.4h[2] 2609 smlal v22.4s, v12.4h, v5.4h[0] 2610 smlsl v22.4s, v13.4h, v7.4h[2] 2611 smlal v16.4s, v12.4h, v3.4h[0] 2612 smlsl v16.4s, v13.4h, v4.4h[2] 2613 smlal v18.4s, v12.4h, v1.4h[0] 2614 smlsl v18.4s, v13.4h, v1.4h[2] 2615 2616 cmp x12,x5 2617 bhs stage2_shift4 2618 2619 ld1 {v10.4h, v11.4h},[x1],#16 2620 ld1 {v8.4h, v9.4h},[x1],x10 2621 2622 2623 2624 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0) 2625 smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1) 2626 smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) 2627 smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3) 2628 2629 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2630 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2631 smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2632 smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2633 2634 2635 2636 2637 2638 smlsl v20.4s, v10.4h, v2.4h[0] 2639 smlal v20.4s, v11.4h, v1.4h[2] 2640 2641 2642 smlsl v22.4s, v10.4h, v6.4h[0] 2643 smlal v22.4s, v11.4h, v3.4h[2] 2644 2645 smlal v16.4s, v10.4h, v6.4h[0] 2646 smlsl v16.4s, v11.4h, v7.4h[2] 2647 2648 smlal v18.4s, v10.4h, v2.4h[0] 2649 smlsl v18.4s, v11.4h, v2.4h[2] 2650 2651 cmp x12,x6 2652 bhs stage2_shift4 2653 2654 2655 ld1 {v12.4h, v13.4h},[x1],#16 2656 ld1 {v14.4h, v15.4h},[x1],x10 2657 2658 2659 2660 2661 2662 2663 smlsl v24.4s, v14.4h, v1.4h[1] 2664 smlsl v26.4s, v14.4h, v7.4h[3] 2665 smlal v28.4s, v14.4h, v1.4h[3] 2666 smlal v30.4s, v14.4h, v4.4h[3] 2667 2668 2669 smlal v24.4s, v15.4h, v2.4h[1] 2670 smlal v26.4s, v15.4h, v5.4h[1] 2671 smlsl v28.4s, v15.4h, v3.4h[1] 2672 smlsl v30.4s, v15.4h, v4.4h[1] 2673 2674 2675 smlsl v20.4s, v12.4h, v5.4h[0] 2676 smlsl v20.4s, v13.4h, v7.4h[2] 2677 smlsl v22.4s, v12.4h, v1.4h[0] 2678 smlal v22.4s, v13.4h, v1.4h[2] 2679 smlsl v16.4s, v12.4h, 
v7.4h[0] 2680 smlal v16.4s, v13.4h, v5.4h[2] 2681 smlal v18.4s, v12.4h, v3.4h[0] 2682 smlsl v18.4s, v13.4h, v3.4h[2] 2683 2684 cmp x12,x9 2685 bhs stage2_shift4 2686 2687 2688 ld1 {v10.4h, v11.4h},[x1],#16 2689 ld1 {v8.4h, v9.4h},[x1],x10 2690 2691 2692 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0) 2693 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1) 2694 smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2) 2695 smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3) 2696 2697 smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2698 smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2699 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2700 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2701 2702 2703 2704 2705 2706 smlal v20.4s, v10.4h, v0.4h[0] 2707 smlsl v20.4s, v11.4h, v0.4h[2] 2708 2709 2710 smlsl v22.4s, v10.4h, v0.4h[0] 2711 smlal v22.4s, v11.4h, v6.4h[2] 2712 2713 smlsl v16.4s, v10.4h, v0.4h[0] 2714 smlal v16.4s, v11.4h, v2.4h[2] 2715 2716 smlal v18.4s, v10.4h, v0.4h[0] 2717 smlsl v18.4s, v11.4h, v4.4h[2] 2718 2719 ld1 {v12.4h, v13.4h},[x1],#16 2720 ld1 {v14.4h, v15.4h},[x1],x10 2721 2722 2723 2724 2725 smlal v24.4s, v14.4h, v3.4h[1] 2726 smlsl v26.4s, v14.4h, v2.4h[1] 2727 smlal v28.4s, v14.4h, v7.4h[3] 2728 smlal v30.4s, v14.4h, v2.4h[3] 2729 2730 2731 smlsl v24.4s, v15.4h, v0.4h[3] 2732 smlal v26.4s, v15.4h, v4.4h[3] 2733 smlal v28.4s, v15.4h, v6.4h[3] 2734 smlsl v30.4s, v15.4h, v2.4h[1] 2735 2736 2737 smlal v20.4s, v12.4h, v3.4h[0] 2738 smlsl v20.4s, v13.4h, v6.4h[2] 2739 smlal v22.4s, v12.4h, v7.4h[0] 2740 smlsl v22.4s, v13.4h, v4.4h[2] 2741 smlsl v16.4s, v12.4h, v1.4h[0] 2742 smlal v16.4s, v13.4h, v0.4h[2] 2743 smlal v18.4s, v12.4h, v5.4h[0] 2744 smlsl v18.4s, v13.4h, v5.4h[2] 2745 2746 2747 ld1 {v10.4h, v11.4h},[x1],#16 2748 ld1 {v8.4h, v9.4h},[x1],x10 2749 2750 2751 2752 2753 smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0) 2754 smlsl 
v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1) 2755 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2) 2756 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) 2757 2758 smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2759 smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2760 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2761 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2762 2763 2764 2765 2766 2767 smlsl v20.4s, v10.4h, v6.4h[0] 2768 smlal v20.4s, v11.4h, v2.4h[2] 2769 2770 2771 smlal v22.4s, v10.4h, v2.4h[0] 2772 smlsl v22.4s, v11.4h, v0.4h[2] 2773 2774 smlsl v16.4s, v10.4h, v2.4h[0] 2775 smlal v16.4s, v11.4h, v3.4h[2] 2776 2777 smlal v18.4s, v10.4h, v6.4h[0] 2778 smlsl v18.4s, v11.4h, v6.4h[2] 2779 2780 2781 ld1 {v12.4h, v13.4h},[x1],#16 2782 ld1 {v14.4h, v15.4h},[x1],x10 2783 2784 2785 2786 smlsl v24.4s, v14.4h, v5.4h[1] 2787 smlal v26.4s, v14.4h, v3.4h[3] 2788 smlsl v28.4s, v14.4h, v2.4h[1] 2789 smlal v30.4s, v14.4h, v0.4h[3] 2790 2791 2792 smlal v24.4s, v15.4h, v1.4h[3] 2793 smlsl v26.4s, v15.4h, v1.4h[1] 2794 smlal v28.4s, v15.4h, v0.4h[3] 2795 smlsl v30.4s, v15.4h, v0.4h[1] 2796 2797 2798 smlsl v20.4s, v12.4h, v1.4h[0] 2799 smlal v20.4s, v13.4h, v4.4h[2] 2800 smlal v22.4s, v12.4h, v3.4h[0] 2801 smlsl v22.4s, v13.4h, v5.4h[2] 2802 smlsl v16.4s, v12.4h, v5.4h[0] 2803 smlal v16.4s, v13.4h, v6.4h[2] 2804 smlal v18.4s, v12.4h, v7.4h[0] 2805 smlsl v18.4s, v13.4h, v7.4h[2] 2806 2807 stage2_shift4: 2808 add v8.4s, v20.4s , v24.4s 2809 sub v10.4s, v20.4s , v24.4s 2810 2811 add v12.4s, v22.4s , v26.4s 2812 sub v24.4s, v22.4s , v26.4s 2813 2814 add v14.4s, v16.4s , v28.4s 2815 sub v26.4s, v16.4s , v28.4s 2816 2817 2818 add v16.4s, v18.4s , v30.4s 2819 sub v28.4s, v18.4s , v30.4s 2820 2821 2822 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2823 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 
7(shift_stage2_idct) 2824 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2825 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2826 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2827 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2828 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2829 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 2830 2831 2832 2833 umov x15,v24.d[0] 2834 umov x16,v25.d[0] 2835 umov x19,v26.d[0] 2836 umov x20,v27.d[0] 2837 2838 trn1 v24.4h, v30.4h, v12.4h 2839 trn2 v25.4h, v30.4h, v12.4h 2840 trn1 v26.4h, v31.4h, v13.4h 2841 trn2 v27.4h, v31.4h, v13.4h 2842 2843 trn1 v30.2s, v24.2s, v26.2s 2844 trn2 v31.2s, v24.2s, v26.2s 2845 trn1 v12.2s, v25.2s, v27.2s 2846 trn2 v13.2s, v25.2s, v27.2s 2847 2848 trn1 v24.4h, v14.4h, v18.4h 2849 trn2 v25.4h, v14.4h, v18.4h 2850 trn1 v26.4h, v15.4h, v19.4h 2851 trn2 v27.4h, v15.4h, v19.4h 2852 2853 trn1 v14.2s, v24.2s, v26.2s 2854 trn2 v15.2s, v24.2s, v26.2s 2855 trn1 v18.2s, v25.2s, v27.2s 2856 trn2 v19.2s, v25.2s, v27.2s 2857 2858 mov v24.d[0],x15 2859 mov v25.d[0],x16 2860 mov v26.d[0],x19 2861 mov v27.d[0],x20 2862 2863 st1 { v30.4h, v31.4h},[x0],#16 2864 st1 { v12.4h, v13.4h},[x0],#16 2865 st1 { v14.4h, v15.4h},[x0],#16 2866 st1 { v18.4h, v19.4h},[x0],#16 2867 2868 2869 2870 2871 sub x0,x0,#256 2872 prediction_buffer: 2873 2874 2875 ld1 {v12.8h},[x0],#16 2876 ld1 {v14.8h},[x0],#16 2877 2878 add x0,x0,#32 2879 2880 ld1 {v16.8h},[x0],#16 2881 ld1 {v18.8h},[x0],#16 2882 add x0,x0,#32 2883 2884 ld1 {v20.8h},[x0],#16 2885 ld1 {v22.8h},[x0],#16 2886 2887 2888 add x0,x0,#32 2889 2890 ld1 {v24.8h},[x0],#16 2891 ld1 {v26.8h},[x0],#16 2892 2893 2894 2895 2896 2897 // d12 =x0 1- 4 values 2898 // d13 =x2 1- 4 values 2899 // d14=x1 1- 4 values 2900 // d15=x3 
1- 4 values 2901 2902 // d16 =x0 5- 8 values 2903 // d17 =x2 5- 8 values 2904 // d18=x1 5- 8 values 2905 // d19=x3 5- 8 values 2906 2907 // d20 =x0 9- 12 values 2908 // d21 =x2 9- 12 values 2909 // d22=x1 9- 12 values 2910 // d23=x3 9- 12 values 2911 2912 // d24 =x0 13-16 values 2913 // d25 =x2 13- 16 values 2914 // d26=x1 13- 16 values 2915 // d27=x3 13- 16 values 2916 2917 // swapping v12 upper and v16 lower 64bits 2918 mov v13.d[0], v12.d[1] 2919 mov v12.d[1], v16.d[0] 2920 mov v16.d[0], v13.d[0] 2921 // swapping v20 upper and v24 lower 64bits 2922 mov v21.d[0], v20.d[1] 2923 mov v20.d[1], v24.d[0] 2924 mov v24.d[0], v21.d[0] 2925 // swapping v14 uppper and v18 lower 64bits 2926 mov v15.d[0], v14.d[1] 2927 mov v14.d[1], v18.d[0] 2928 mov v18.d[0], v15.d[0] 2929 // swapping v22 upper and v26 lower 64bits 2930 mov v23.d[0], v22.d[1] 2931 mov v22.d[1], v26.d[0] 2932 mov v26.d[0], v23.d[0] 2933 2934 2935 ld1 {v8.8b, v9.8b},[x2],x8 2936 ld1 {v10.8b, v11.8b},[x2],x8 2937 ld1 {v28.8b, v29.8b},[x2],x8 2938 ld1 {v30.8b, v31.8b},[x2],x8 2939 2940 2941 uaddw v12.8h, v12.8h , v8.8b 2942 uaddw v20.8h, v20.8h , v9.8b 2943 uaddw v14.8h, v14.8h , v10.8b 2944 uaddw v22.8h, v22.8h , v11.8b 2945 uaddw v16.8h, v16.8h , v28.8b 2946 uaddw v24.8h, v24.8h , v29.8b 2947 uaddw v18.8h, v18.8h , v30.8b 2948 uaddw v26.8h, v26.8h , v31.8b 2949 sub x2,x2,x8,lsl #2 2950 add x2,x2,#16 2951 sqxtun v12.8b, v12.8h 2952 sqxtun v13.8b, v20.8h 2953 sqxtun v20.8b, v14.8h 2954 sqxtun v21.8b, v22.8h 2955 sqxtun v14.8b, v16.8h 2956 sqxtun v15.8b, v24.8h 2957 sqxtun v22.8b, v18.8h 2958 sqxtun v23.8b, v26.8h 2959 2960 2961 st1 {v12.8b, v13.8b},[x3],x7 2962 st1 {v20.8b, v21.8b},[x3],x7 2963 st1 {v14.8b, v15.8b},[x3],x7 2964 st1 {v22.8b, v23.8b},[x3],x7 2965 2966 2967 sub x3,x3,x7,lsl #2 2968 add x3,x3,#16 2969 2970 ld1 {v12.8h},[x0],#16 2971 ld1 {v14.8h},[x0],#16 2972 2973 sub x0,x0,#96 2974 2975 ld1 {v16.8h},[x0],#16 2976 ld1 {v18.8h},[x0],#16 2977 sub x0,x0,#96 2978 2979 ld1 {v20.8h},[x0],#16 2980 ld1 
{v22.8h},[x0],#16 2981 2982 2983 sub x0,x0,#96 2984 2985 ld1 {v24.8h},[x0],#16 2986 ld1 {v26.8h},[x0],#16 2987 2988 2989 sub x0,x0,#64 2990 2991 2992 // swapping v12 upper and v16 lower 64bits 2993 mov v13.d[0], v12.d[1] 2994 mov v12.d[1], v16.d[0] 2995 mov v16.d[0], v13.d[0] 2996 // swapping v20 upper and v24 lower 64bits 2997 mov v21.d[0], v20.d[1] 2998 mov v20.d[1], v24.d[0] 2999 mov v24.d[0], v21.d[0] 3000 // swapping v14 uppper and v18 lower 64bits 3001 mov v15.d[0], v14.d[1] 3002 mov v14.d[1], v18.d[0] 3003 mov v18.d[0], v15.d[0] 3004 // swapping v22 upper and v26 lower 64bits 3005 mov v23.d[0], v22.d[1] 3006 mov v22.d[1], v26.d[0] 3007 mov v26.d[0], v23.d[0] 3008 3009 3010 ld1 {v8.8b, v9.8b},[x2],x8 3011 ld1 {v10.8b, v11.8b},[x2],x8 3012 ld1 {v28.8b, v29.8b},[x2],x8 3013 ld1 {v30.8b, v31.8b},[x2],x8 3014 3015 3016 uaddw v12.8h, v12.8h , v8.8b 3017 uaddw v20.8h, v20.8h , v9.8b 3018 uaddw v14.8h, v14.8h , v10.8b 3019 uaddw v22.8h, v22.8h , v11.8b 3020 uaddw v16.8h, v16.8h , v28.8b 3021 uaddw v24.8h, v24.8h , v29.8b 3022 uaddw v18.8h, v18.8h , v30.8b 3023 uaddw v26.8h, v26.8h , v31.8b 3024 sub x2,x2,#16 3025 3026 sqxtun v12.8b, v12.8h 3027 sqxtun v13.8b, v20.8h 3028 sqxtun v20.8b, v14.8h 3029 sqxtun v21.8b, v22.8h 3030 sqxtun v14.8b, v16.8h 3031 sqxtun v15.8b, v24.8h 3032 sqxtun v22.8b, v18.8h 3033 sqxtun v23.8b, v26.8h 3034 3035 3036 st1 {v12.8b, v13.8b},[x3],x7 3037 st1 {v20.8b, v21.8b},[x3],x7 3038 st1 {v14.8b, v15.8b},[x3],x7 3039 st1 {v22.8b, v23.8b},[x3],x7 3040 3041 sub x3,x3,#16 3042 3043 subs x14,x14,#1 3044 bne dct_stage2 3045 // ldmfd sp!,{x0-x12,pc} 3046 ldp x19, x20,[sp],#16 3047 pop_v_regs 3048 ret 3049 3050 3051 3052 3053 3054