//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name            : impeg2_inter_pred.s
////
//// Description          : This file has motion compensation related
////                        interpolation functions on Neon + CortexA-8 platform
////
//// Reference Document   :
////
//// Revision History     :
////      Date            Author                  Detail Description
////   ------------    ----------------    ----------------------------------
////   18 jun 2010     S Hamsalekha         Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
//              PRESERVE8
.text
.include "impeg2_neon_macros.s"

///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/


///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------


///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_copy_mb_av8()
////
//// Detail Description   : Copies one macroblock from src to dst:
////                        16 rows x 16 bytes of Y, then 8 rows x 8 bytes
////                        of U and 8 rows x 8 bytes of V.
////
//// Inputs               : x0 - pointer to src; three 8-byte plane pointers
////                             are read from it (y at +0, u at +8, v at +16)
////                        x1 - pointer to dst; same layout as src
////                        x2 - source width, used as the Y row step in bytes
////                        x3 - destination width, used as the Y row step
////                        The U/V row step is the Y step shifted right by 1.
////
//// Registers Used       : x2-x5, v0, v1 (x2 and x3 are halved on the way)
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The copied macroblock in the dst planes
////
//// Return Data          : None
////
//// Programming Note     : NOTE(review): x2/x3 are consumed as full 64-bit
////                        post-index registers. If the C prototype passes
////                        them as 32-bit values the upper halves are not
////                        guaranteed to be zero - confirm against the caller.
////-----------------------------------------------------------------------------
//*/

    .global impeg2_copy_mb_av8

impeg2_copy_mb_av8:

    push_v_regs

    // ---- Y plane: 16 rows, 16 bytes each ---------------------------------
    ldr             x4, [x0]                    // x4 = src->y
    ldr             x5, [x1]                    // x5 = dst->y
    .rept 16
    ld1             {v0.8b, v1.8b}, [x4], x2    // fetch one row, step src
    st1             {v0.8b, v1.8b}, [x5], x3    // write one row, step dst
    .endr

    // Chroma planes use half the luma row step.
    lsr             x2, x2, #1                  // src step /= 2
    lsr             x3, x3, #1                  // dst step /= 2

    // ---- U plane: 8 rows, 8 bytes each -----------------------------------
    ldr             x4, [x0, #8]                // x4 = src->u
    ldr             x5, [x1, #8]                // x5 = dst->u
    .rept 8
    ld1             {v0.8b}, [x4], x2           // fetch one row, step src
    st1             {v0.8b}, [x5], x3           // write one row, step dst
    .endr

    // ---- V plane: 8 rows, 8 bytes each -----------------------------------
    ldr             x4, [x0, #16]               // x4 = src->v
    ldr             x5, [x1, #16]               // x5 = dst->v
    .rept 8
    ld1             {v0.8b}, [x4], x2           // fetch one row, step src
    st1             {v0.8b}, [x5], x3           // write one row, step dst
    .endr

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description   : This function pastes
////                        the reference block in the
////                        current frame buffer. This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution.
////
//// Inputs               : x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used       : x14, v0-v9 (x1, x2, x3 are modified)
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The Motion Compensated Block
////
//// Return Data          : None
////
//// Programming Note     : <program limitation>
////-----------------------------------------------------------------------------
//*/

// impeg2_mc_fullx_halfy_8x8_av8: vertical half-pel average of an 8x8 block.
//
//     out[r][c] = (ref[r][c] + ref[r + 1][c] + 1) >> 1      r, c = 0..7
//
// Nine reference rows of 8 bytes are read. Odd rows (1,3,5,7,9) stream through
// x1 and even rows (2,4,6,8) through x14, both stepping 2 * ref_wid. Output is
// written the same way: odd rows through x0, even rows through x14.
//
// All averages use the 64-bit (.8b) form of urhadd: each register holds one
// 8-byte row, and a 64-bit vector load already clears bits 127:64, so the
// 128-bit form would only average zero padding. Stored bytes are unchanged.
//
// NOTE(review): x2/x3 are consumed as full 64-bit values. If the C prototype
// passes the widths as 32-bit, the upper halves are not guaranteed to be zero -
// confirm against the caller.

    .global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

    push_v_regs                                 // v8/v9 are written below
    add             x14, x1, x2                 // x14 -> ref row 2
    lsl             x2, x2, #1                  // each pointer skips a row

    // Load 8 + 1 reference rows, averaging as soon as both inputs are in.
    // urhadd rounds up, i.e. (a + b + 1) >> 1.
    ld1             {v0.8b}, [x1], x2           // ref row 1 -> v0
    ld1             {v2.8b}, [x14], x2          // ref row 2 -> v2
    ld1             {v4.8b}, [x1], x2           // ref row 3 -> v4
    ld1             {v6.8b}, [x14], x2          // ref row 4 -> v6
    ld1             {v1.8b}, [x1], x2           // ref row 5 -> v1
    ld1             {v3.8b}, [x14], x2          // ref row 6 -> v3
    urhadd          v9.8b, v1.8b, v6.8b         // out row 4 = avg(row 5, row 4);
                                                // taken before v1 is overwritten
    ld1             {v5.8b}, [x1], x2           // ref row 7 -> v5
    urhadd          v0.8b, v0.8b, v2.8b         // out row 1 = avg(row 1, row 2)
    urhadd          v1.8b, v1.8b, v3.8b         // out row 5 = avg(row 5, row 6)
    ld1             {v7.8b}, [x14], x2          // ref row 8 -> v7
    urhadd          v2.8b, v2.8b, v4.8b         // out row 2 = avg(row 2, row 3)
    urhadd          v3.8b, v3.8b, v5.8b         // out row 6 = avg(row 6, row 7)
    ld1             {v8.8b}, [x1], x2           // ref row 9 -> v8
    urhadd          v4.8b, v4.8b, v6.8b         // out row 3 = avg(row 3, row 4)
    urhadd          v5.8b, v5.8b, v7.8b         // out row 7 = avg(row 7, row 8)

    add             x14, x0, x3                 // x14 -> out row 2
    lsl             x3, x3, #1                  // each pointer skips a row

    // Store the eight averaged rows.
    st1             {v2.8b}, [x14], x3          // out row 2
    urhadd          v7.8b, v7.8b, v8.8b         // out row 8 = avg(row 8, row 9)
    st1             {v0.8b}, [x0], x3           // out row 1
    st1             {v9.8b}, [x14], x3          // out row 4
    st1             {v4.8b}, [x0], x3           // out row 3
    st1             {v3.8b}, [x14], x3          // out row 6
    st1             {v1.8b}, [x0], x3           // out row 5
    st1             {v7.8b}, [x14], x3          // out row 8
    st1             {v5.8b}, [x0], x3           // out row 7

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description   : This function pastes the reference block in the
////                        current frame buffer. This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution and VopRoundingType is 0 ..
////
//// Inputs               : x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used       : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The Motion Compensated Block
////
//// Return Data          : None
////
//// Programming Note     : <program limitation>
////-----------------------------------------------------------------------------
//*/

// impeg2_mc_halfx_fully_8x8_av8: horizontal half-pel average of an 8x8 block.
//
//     out[r][c] = (ref[r][c] + ref[r][c + 1] + 1) >> 1      r, c = 0..7
//
// Each row is fetched as 16 bytes (only the first 9 are used); ext #1 builds
// the copy shifted left by one pixel. Rows 1-4 stream through x1/x0 and rows
// 5-8 through x14/x12.
//
// urhadd uses the 64-bit (.8b) form: every operand was produced by a 64-bit
// load or ext, which clears bits 127:64, so the 128-bit form would only
// average zero padding. Stored bytes are unchanged.
//
// NOTE(review): x2/x3 are consumed as full 64-bit values. If the C prototype
// passes the widths as 32-bit, the upper halves are not guaranteed to be zero -
// confirm against the caller.

    .global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:

    push_v_regs                                 // v8-v14 are written below

    add             x14, x1, x2, lsl #2         // x14 -> ref row 5
    add             x12, x0, x3, lsl #2         // x12 -> out row 5

    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1, 16 pixels
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6

    ext             v8.8b, v0.8b, v1.8b, #1     // row 1, pixels 1..8
    ext             v12.8b, v2.8b, v3.8b, #1    // row 5, pixels 1..8
    ext             v16.8b, v4.8b, v5.8b, #1    // row 2, pixels 1..8
    ext             v20.8b, v6.8b, v7.8b, #1    // row 6, pixels 1..8

    ld1             {v9.8b, v10.8b}, [x1], x2   // ref row 3
    ld1             {v13.8b, v14.8b}, [x14], x2 // ref row 7
    ld1             {v17.8b, v18.8b}, [x1], x2  // ref row 4
    ld1             {v21.8b, v22.8b}, [x14], x2 // ref row 8

    // v1/v3/v5/v7 (upper halves of rows 1/5/2/6) are dead now and are reused
    // for the shifted copies of rows 3/7/4/8.
    ext             v1.8b, v9.8b, v10.8b, #1    // row 3, pixels 1..8
    ext             v3.8b, v13.8b, v14.8b, #1   // row 7, pixels 1..8
    ext             v5.8b, v17.8b, v18.8b, #1   // row 4, pixels 1..8
    ext             v7.8b, v21.8b, v22.8b, #1   // row 8, pixels 1..8

    urhadd          v0.8b, v0.8b, v8.8b         // out row 1
    urhadd          v1.8b, v1.8b, v9.8b         // out row 3
    urhadd          v2.8b, v2.8b, v12.8b        // out row 5
    urhadd          v3.8b, v3.8b, v13.8b        // out row 7
    urhadd          v4.8b, v4.8b, v16.8b        // out row 2
    urhadd          v5.8b, v5.8b, v17.8b        // out row 4
    urhadd          v6.8b, v6.8b, v20.8b        // out row 6
    urhadd          v7.8b, v7.8b, v21.8b        // out row 8

    st1             {v0.8b}, [x0], x3           // store row 1
    st1             {v2.8b}, [x12], x3          // store row 5
    st1             {v4.8b}, [x0], x3           // store row 2
    st1             {v6.8b}, [x12], x3          // store row 6
    st1             {v1.8b}, [x0], x3           // store row 3
    st1             {v3.8b}, [x12], x3          // store row 7
    st1             {v5.8b}, [x0], x3           // store row 4
    st1             {v7.8b}, [x12], x3          // store row 8

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description   : This function pastes the reference block in the
////                        current frame buffer. This function is called for
////                        blocks that are not coded and have motion vectors
////                        with a half pel resolution and VopRoundingType is 0 ..
////
//// Inputs               : x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used       : x14, v0-v18, v20, v22, v24, v26, v28, v30
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The Motion Compensated Block
////
//// Return Data          : None
////
//// Programming Note     : <program limitation>
////-----------------------------------------------------------------------------
//*/

// impeg2_mc_halfx_halfy_8x8_av8: two-dimensional half-pel average of 8x8.
//
//     out[r][c] = (ref[r][c]     + ref[r][c + 1] +
//                  ref[r + 1][c] + ref[r + 1][c + 1] + 2) >> 2    r, c = 0..7
//
// Nine rows are fetched as 16 bytes each (first 9 used). uaddl widens the
// horizontal pair sums to 16 bits, add combines vertically adjacent rows and
// rshrn #2 rounds and narrows back to bytes.
//
// NOTE(review): x2/x3 are consumed as full 64-bit values. If the C prototype
// passes the widths as 32-bit, the upper halves are not guaranteed to be zero -
// confirm against the caller.

    .global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    push_v_regs                                 // v8-v15 are written below

    add             x14, x1, x2, lsl #2         // x14 -> ref row 5

    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1, 16 pixels
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6

    // Replace the upper 8 bytes of each row with its pixels 1..8.
    ext             v1.8b, v0.8b, v1.8b, #1     // row 1 shifted
    ext             v3.8b, v2.8b, v3.8b, #1     // row 5 shifted
    ext             v5.8b, v4.8b, v5.8b, #1     // row 2 shifted
    ext             v7.8b, v6.8b, v7.8b, #1     // row 6 shifted

    ld1             {v8.8b, v9.8b}, [x1], x2    // ref row 3
    ld1             {v10.8b, v11.8b}, [x14], x2 // ref row 7
    ld1             {v12.8b, v13.8b}, [x1], x2  // ref row 4
    ld1             {v14.8b, v15.8b}, [x14], x2 // ref row 8

    ext             v9.8b, v8.8b, v9.8b, #1     // row 3 shifted

    ld1             {v16.8b, v17.8b}, [x14], x2 // ref row 9

    ext             v11.8b, v10.8b, v11.8b, #1  // row 7 shifted
    ext             v13.8b, v12.8b, v13.8b, #1  // row 4 shifted
    ext             v15.8b, v14.8b, v15.8b, #1  // row 8 shifted
    ext             v17.8b, v16.8b, v17.8b, #1  // row 9 shifted

    // Horizontal pass: 16-bit sum of each pixel and its right neighbour.
    uaddl           v0.8h, v0.8b, v1.8b         // row 1
    uaddl           v2.8h, v2.8b, v3.8b         // row 5
    uaddl           v4.8h, v4.8b, v5.8b         // row 2
    uaddl           v6.8h, v6.8b, v7.8b         // row 6
    uaddl           v8.8h, v8.8b, v9.8b         // row 3
    uaddl           v10.8h, v10.8b, v11.8b      // row 7
    uaddl           v12.8h, v12.8b, v13.8b      // row 4
    uaddl           v14.8h, v14.8b, v15.8b      // row 8
    uaddl           v16.8h, v16.8b, v17.8b      // row 9

    // Vertical pass: add adjacent row sums, then (sum + 2) >> 2.
    add             x14, x0, x3, lsl #2         // x14 -> out row 5

    add             v18.8h, v0.8h, v4.8h        // rows 1 + 2
    add             v26.8h, v2.8h, v6.8h        // rows 5 + 6
    add             v20.8h, v4.8h, v8.8h        // rows 2 + 3
    add             v28.8h, v6.8h, v10.8h       // rows 6 + 7

    rshrn           v18.8b, v18.8h, #2          // out row 1
    rshrn           v26.8b, v26.8h, #2          // out row 5
    rshrn           v20.8b, v20.8h, #2          // out row 2
    rshrn           v28.8b, v28.8h, #2          // out row 6

    add             v22.8h, v8.8h, v12.8h       // rows 3 + 4
    st1             {v18.8b}, [x0], x3          // store row 1
    add             v30.8h, v10.8h, v14.8h      // rows 7 + 8
    st1             {v26.8b}, [x14], x3         // store row 5
    add             v24.8h, v12.8h, v2.8h       // rows 4 + 5
    st1             {v20.8b}, [x0], x3          // store row 2
    add             v14.8h, v14.8h, v16.8h      // rows 8 + 9
    st1             {v28.8b}, [x14], x3         // store row 6

    rshrn           v22.8b, v22.8h, #2          // out row 3
    rshrn           v30.8b, v30.8h, #2          // out row 7
    rshrn           v24.8b, v24.8h, #2          // out row 4
    rshrn           v14.8b, v14.8h, #2          // out row 8

    st1             {v22.8b}, [x0], x3          // store row 3
    st1             {v30.8b}, [x14], x3         // store row 7
    st1             {v24.8b}, [x0], x3          // store row 4
    st1             {v14.8b}, [x14], x3         // store row 8

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description   : This function pastes the reference block in the
////                        current frame buffer. The visible code is a plain
////                        8x8 copy with no interpolation.
////
//// Inputs               : x0 - out      : Current Block Pointer
////                        x1 - ref      : Reference Block Pointer
////                        x2 - ref_wid  : Reference Block Width
////                        x3 - out_wid  : Current Block Width
////
//// Registers Used       : x12, x14, v0-v3
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The Motion Compensated Block
////
//// Return Data          : None
////
//// Programming Note     : NOTE(review): x2/x3 are consumed as full 64-bit
////                        values. If the C prototype passes the widths as
////                        32-bit, the upper halves are not guaranteed to be
////                        zero - confirm against the caller.
////-----------------------------------------------------------------------------
//*/

    .global impeg2_mc_fullx_fully_8x8_av8

impeg2_mc_fullx_fully_8x8_av8:

    push_v_regs

    // Two row streams: rows 1-4 through x1/x0, rows 5-8 through x14/x12.
    add             x14, x1, x2, lsl #2         // x14 -> ref row 5
    add             x12, x0, x3, lsl #2         // x12 -> out row 5

    ld1             {v0.8b}, [x1], x2           // load row 1
    ld1             {v1.8b}, [x14], x2          // load row 5
    ld1             {v2.8b}, [x1], x2           // load row 2
    ld1             {v3.8b}, [x14], x2          // load row 6

    st1             {v0.8b}, [x0], x3           // store row 1
    st1             {v1.8b}, [x12], x3          // store row 5
    st1             {v2.8b}, [x0], x3           // store row 2
    st1             {v3.8b}, [x12], x3          // store row 6

    ld1             {v0.8b}, [x1], x2           // load row 3
    ld1             {v1.8b}, [x14], x2          // load row 7
    ld1             {v2.8b}, [x1], x2           // load row 4
    ld1             {v3.8b}, [x14], x2          // load row 8

    st1             {v0.8b}, [x0], x3           // store row 3
    st1             {v1.8b}, [x12], x3          // store row 7
    st1             {v2.8b}, [x0], x3           // store row 4
    st1             {v3.8b}, [x12], x3          // store row 8

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name        : impeg2_interpolate_av8()
////
//// Detail Description   : Averages two prediction buffers, with rounding,
////                        into the destination: out = (src1 + src2 + 1) >> 1
////                        for a 16x16 Y block and 8x8 U and V blocks.
////
//// Inputs               : x0 - pointer to src1; three 8-byte plane pointers
////                             are read from it (y at +0, u at +8, v at +16)
////                        x1 - pointer to src2; same layout
////                        x2 - dest buf; same layout
////                        x3 - dst stride (Y row step; halved for U/V)
////                        Source rows are read back to back: 16 bytes per
////                        Y row and 8 bytes per U/V row, with no stride.
////
//// Registers Used       : x3, x4, x5, x7, x12, v0-v15
////
//// Stack Usage          : 64 bytes per the original header; the actual
////                        amount is whatever push_v_regs reserves (macro
////                        comes from impeg2_neon_macros.s, not shown here)
////
//// Outputs              : The Motion Compensated Block
////
//// Return Data          : None
////
//// Programming Note     : NOTE(review): x3 is consumed as a full 64-bit
////                        value. If the C prototype passes the stride as
////                        32-bit, the upper half is not guaranteed to be
////                        zero - confirm against the caller.
////-----------------------------------------------------------------------------
//*/

    .global impeg2_interpolate_av8

impeg2_interpolate_av8:

    push_v_regs                                 // v8-v15 are written below

    ldr             x4, [x0, #0]                // x4 = src1 y pointer
    ldr             x5, [x1, #0]                // x5 = src2 y pointer
    ldr             x7, [x2, #0]                // x7 = dst y pointer

    mov             x12, #4                     // 4 passes x 4 rows = 16 rows

    // ---- luma: each pass averages four 16-byte rows ----------------------
interp_lumablocks_stride:
    ld1             {v0.16b}, [x4], #16         // row n     src1
    ld1             {v2.16b}, [x4], #16         // row n + 1 src1
    ld1             {v4.16b}, [x4], #16         // row n + 2 src1
    ld1             {v6.16b}, [x4], #16         // row n + 3 src1

    ld1             {v8.16b}, [x5], #16         // row n     src2
    ld1             {v10.16b}, [x5], #16        // row n + 1 src2
    ld1             {v12.16b}, [x5], #16        // row n + 2 src2
    ld1             {v14.16b}, [x5], #16        // row n + 3 src2

    urhadd          v0.16b, v0.16b, v8.16b      // average row n
    urhadd          v2.16b, v2.16b, v10.16b     // average row n + 1
    urhadd          v4.16b, v4.16b, v12.16b     // average row n + 2
    urhadd          v6.16b, v6.16b, v14.16b     // average row n + 3

    st1             {v0.16b}, [x7], x3          // store row n
    st1             {v2.16b}, [x7], x3          // store row n + 1
    st1             {v4.16b}, [x7], x3          // store row n + 2
    st1             {v6.16b}, [x7], x3          // store row n + 3

    subs            x12, x12, #1
    bne             interp_lumablocks_stride

    lsr             x3, x3, #1                  // chroma row step = stride >> 1

    ldr             x4, [x0, #8]                // x4 = src1 u pointer
    ldr             x5, [x1, #8]                // x5 = src2 u pointer
    ldr             x7, [x2, #8]                // x7 = dst u pointer

    mov             x12, #2                     // pass 1 = U, pass 2 = V

    // ---- chroma: each pass averages one 8x8 plane ------------------------
    // Every register holds a single 8-byte row, so the 64-bit (.8b) form of
    // urhadd is used; the 64-bit loads clear bits 127:64, so the 128-bit form
    // would only average zero padding. Stored bytes are unchanged.
interp_chromablocks_stride:
    ld1             {v0.8b, v1.8b}, [x4], #16   // rows 1 & 2 src1
    ld1             {v2.8b, v3.8b}, [x4], #16   // rows 3 & 4 src1
    ld1             {v4.8b, v5.8b}, [x4], #16   // rows 5 & 6 src1
    ld1             {v6.8b, v7.8b}, [x4], #16   // rows 7 & 8 src1

    ld1             {v8.8b, v9.8b}, [x5], #16   // rows 1 & 2 src2
    ld1             {v10.8b, v11.8b}, [x5], #16 // rows 3 & 4 src2
    ld1             {v12.8b, v13.8b}, [x5], #16 // rows 5 & 6 src2
    ld1             {v14.8b, v15.8b}, [x5], #16 // rows 7 & 8 src2

    urhadd          v0.8b, v0.8b, v8.8b         // average row 1
    urhadd          v1.8b, v1.8b, v9.8b         // average row 2
    urhadd          v2.8b, v2.8b, v10.8b        // average row 3
    urhadd          v3.8b, v3.8b, v11.8b        // average row 4
    urhadd          v4.8b, v4.8b, v12.8b        // average row 5
    urhadd          v5.8b, v5.8b, v13.8b        // average row 6
    urhadd          v6.8b, v6.8b, v14.8b        // average row 7
    urhadd          v7.8b, v7.8b, v15.8b        // average row 8

    st1             {v0.8b}, [x7], x3           // store row 1
    st1             {v1.8b}, [x7], x3           // store row 2
    st1             {v2.8b}, [x7], x3           // store row 3
    st1             {v3.8b}, [x7], x3           // store row 4
    st1             {v4.8b}, [x7], x3           // store row 5
    st1             {v5.8b}, [x7], x3           // store row 6
    st1             {v6.8b}, [x7], x3           // store row 7
    st1             {v7.8b}, [x7], x3           // store row 8

    // Point at the V planes for the second pass. These loads also run after
    // the V pass, where the reloaded values are simply not used.
    ldr             x4, [x0, #16]               // x4 = src1 v pointer
    ldr             x5, [x1, #16]               // x5 = src2 v pointer
    ldr             x7, [x2, #16]               // x7 = dst v pointer

    subs            x12, x12, #1
    bne             interp_chromablocks_stride

    pop_v_regs
    ret