@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/*
@//----------------------------------------------------------------------------
@// File Name            : impeg2_inter_pred.s
@//
@// Description          : This file has motion compensation related
@//                        interpolation functions on Neon + CortexA-8 platform
@//
@// Reference Document   :
@//
@// Revision History     :
@//      Date            Author                  Detail Description
@//   ------------    ----------------    ----------------------------------
@//   18 jun 2010     S Hamsalekha              Created
@//
@//-------------------------------------------------------------------------
@*/

@/*
@// ----------------------------------------------------------------------------
@//                              Include Files
@// ----------------------------------------------------------------------------
@*/
.text
.p2align 2


@/*
@// ----------------------------------------------------------------------------
@//                              Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/


@/*
@//
@// ----------------------------------------------------------------------------
@//                  Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------


@/*
@// ----------------------------------------------------------------------------
@//                  Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------

@/*
@// ----------------------------------------------------------------------------
@//                  Exported functions
@// ----------------------------------------------------------------------------
@*/

@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one MB worth of data from src to the dst:
@//                      16 rows of 16 bytes for y, then 8 rows of 8 bytes
@//                      each for u and v.
@//
@// Inputs             : r0 - pointer to src; the y, u and v plane pointers
@//                           are read from byte offsets 0, 4 and 8
@//                      r1 - pointer to dst; same layout as src
@//                      r2 - source width (row step for y, halved for u/v)
@//                      r3 - destination width (row step for y, halved for u/v)
@// Registers Used     : r4, r5, d0, d1
@//
@// Stack Usage        : 12 bytes
@//
@// Outputs            : The copied macroblock in the dst planes
@//
@// Return Data        : None
@//
@// Programming Note   : <program limitation>
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_copy_mb_a9q

impeg2_copy_mb_a9q:

    stmfd           sp!, {r4, r5, lr}

    @// y plane: 16 rows of 16 bytes
    ldr             r4, [r0]                    @ r4 = src->y
    ldr             r5, [r1]                    @ r5 = dst->y
    .rept           16
    vld1.8          {d0, d1}, [r4], r2          @ fetch a row, step src by src width
    vst1.8          {d0, d1}, [r5], r3          @ write the row, step dst by dst width
    .endr

    @// chroma rows are stepped by half of the y widths
    mov             r2, r2, lsr #1              @ src_offset /= 2
    mov             r3, r3, lsr #1              @ dst_offset /= 2

    @// u plane: 8 rows of 8 bytes
    ldr             r4, [r0, #4]                @ r4 = src->u
    ldr             r5, [r1, #4]                @ r5 = dst->u
    .rept           8
    vld1.8          {d0}, [r4], r2              @ fetch a row, step src
    vst1.8          {d0}, [r5], r3              @ write the row, step dst
    .endr

    @// v plane: 8 rows of 8 bytes
    ldr             r4, [r0, #8]                @ r4 = src->v
    ldr             r5, [r1, #8]                @ r5 = dst->v
    .rept           8
    vld1.8          {d0}, [r4], r2              @ fetch a row, step src
    vst1.8          {d0}, [r5], r3              @ write the row, step dst
    .endr

    ldmfd           sp!, {r4, r5, pc}




@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r14, d0-d9 (d8-d9 are saved and restored)
@//
@// Stack Usage        : 20 bytes (4 for lr + 16 for d8-d9)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Reads 9 rows of 8 bytes from ref. Each output row is
@//                      the rounded average (a + b + 1) >> 1 of two
@//                      vertically adjacent reference rows.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

    stmfd           r13!, {r14}
    vpush           {d8-d9}                     @// d8-d15 are callee-saved (AAPCS); d8, d9 are used below
    add             r14, r1, r2                 @// r14 walks the even rows (2, 4, 6, 8)
    mov             r2, r2, lsl #1              @// both pointers step two rows at a time

    @/* Load 8 + 1 rows from reference block */
    @/* vrhadd already adds the rounding value of 1 before halving */
    vld1.8          {d0}, [r1], r2              @// first row hence r1 = D0
    vld1.8          {d2}, [r14], r2             @// second row hence r2 = D2
    vld1.8          {d4}, [r1], r2              @// third row hence r3 = D4
    vld1.8          {d6}, [r14], r2             @// fourth row hence r4 = D6
    vld1.8          {d1}, [r1], r2              @// fifth row hence r5 = D1
    vld1.8          {d3}, [r14], r2             @// sixth row hence r6 = D3
    vrhadd.u8       d9, d1, d6                  @// estimated row 4 = D9
    vld1.8          {d5}, [r1], r2              @// seventh row hence r7 = D5
    vrhadd.u8       q0, q0, q1                  @// estimated row 1 = D0, row 5 = D1
    vld1.8          {d7}, [r14], r2             @// eighth row hence r8 = D7
    vrhadd.u8       q1, q1, q2                  @// estimated row 2 = D2, row 6 = D3
    vld1.8          {d8}, [r1], r2              @// ninth row hence r9 = D8
    vrhadd.u8       q2, q2, q3                  @// estimated row 3 = D4, row 7 = D5

    add             r14, r0, r3                 @// r14 walks the even output rows
    mov             r3, r3, lsl #1              @// both output pointers step two rows

    @/* Store the eight rows calculated above */
    vst1.8          {d2}, [r14], r3             @// second row hence D2
    vrhadd.u8       d7, d7, d8                  @// estimated row 8 = D7
    vst1.8          {d0}, [r0], r3              @// first row hence D0
    vst1.8          {d9}, [r14], r3             @// fourth row hence D9
    vst1.8          {d4}, [r0], r3              @// third row hence D4
    vst1.8          {d3}, [r14], r3             @// sixth row hence D3
    vst1.8          {d1}, [r0], r3              @// fifth row hence D1
    vst1.8          {d7}, [r14], r3             @// eighth row hence D7
    vst1.8          {d5}, [r0], r3              @// seventh row hence D5

    vpop            {d8-d9}
    ldmfd           sp!, {pc}






@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. Each output pixel is the
@//                      rounded average (a + b + 1) >> 1 of a reference
@//                      pixel and its right-hand neighbour in the same row.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d10, d12-d14, d16-d18, d20-d22
@//                      (d8-d14 are saved and restored)
@//
@// Stack Usage        : 64 bytes (8 for r12/lr + 56 for d8-d14)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Every row load reads 16 bytes from ref although only
@//                      the first 9 bytes of the row are used.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_halfx_fully_8x8_a9q

impeg2_mc_halfx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}
    vpush           {d8-d14}                    @ d8-d15 are callee-saved (AAPCS); d8-d10, d12-d14 are used below

    add             r14, r1, r2, lsl #2         @ r14 -> row5 of ref
    add             r12, r0, r3, lsl #2         @ r12 -> row5 of out

    vld1.8          {d0, d1}, [r1], r2          @ load 16 pixels of row1
    vld1.8          {d2, d3}, [r14], r2         @ row5

    vld1.8          {d4, d5}, [r1], r2          @ load 16 pixels row2
    vld1.8          {d6, d7}, [r14], r2         @ row6

    vext.8          d8, d0, d1, #1              @ Extract pixels (1-8) of row1
    vext.8          d12, d2, d3, #1             @ Extract pixels (1-8) of row5
    vext.8          d16, d4, d5, #1             @ Extract pixels (1-8) of row2
    vext.8          d20, d6, d7, #1             @ Extract pixels (1-8) of row6

    vld1.8          {d9, d10}, [r1], r2         @ load row3
    vld1.8          {d13, d14}, [r14], r2       @ load row7
    vld1.8          {d17, d18}, [r1], r2        @ load row4
    vld1.8          {d21, d22}, [r14], r2       @ load row8

    vext.8          d1, d9, d10, #1             @ Extract pixels (1-8) of row3
    vext.8          d3, d13, d14, #1            @ Extract pixels (1-8) of row7
    vext.8          d5, d17, d18, #1            @ Extract pixels (1-8) of row4
    vext.8          d7, d21, d22, #1            @ Extract pixels (1-8) of row8

    @ each q register now pairs two rows: pixels 0-7 in one half, 1-8 in the other
    vrhadd.u8       q0, q0, q4                  @ operate on row1 and row3
    vrhadd.u8       q1, q1, q6                  @ operate on row5 and row7
    vrhadd.u8       q2, q2, q8                  @ operate on row2 and row4
    vrhadd.u8       q3, q3, q10                 @ operate on row6 and row8

    vst1.8          {d0}, [r0], r3              @ store row1
    vst1.8          {d2}, [r12], r3             @ store row5
    vst1.8          {d4}, [r0], r3              @ store row2
    vst1.8          {d6}, [r12], r3             @ store row6
    vst1.8          {d1}, [r0], r3              @ store row3
    vst1.8          {d3}, [r12], r3             @ store row7
    vst1.8          {d5}, [r0], r3              @ store row4
    vst1.8          {d7}, [r12], r3             @ store row8

    vpop            {d8-d14}
    ldmfd           sp!, {r12, pc}








@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and VopRoundingType is 0 ..
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r14, q0-q15 (d8-d15, i.e. q4-q7, are saved and
@//                      restored)
@//
@// Stack Usage        : 68 bytes (4 for lr + 64 for d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Reads 9 rows of 16 bytes from ref although only the
@//                      first 9 bytes of each row are used. Each output
@//                      pixel is (a + b + c + d + 2) >> 2 over the 2x2
@//                      neighbourhood starting at the same position in ref.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

    stmfd           sp!, {r14}
    vpush           {d8-d15}                    @ d8-d15 are callee-saved (AAPCS); q4-q7 are used below

    add             r14, r1, r2, lsl #2         @ r14 -> row5 of ref

    vld1.8          {d0, d1}, [r1], r2          @ load 16 pixels of row1
    vld1.8          {d2, d3}, [r14], r2         @ row5
    vld1.8          {d4, d5}, [r1], r2          @ load 16 pixels row2
    vld1.8          {d6, d7}, [r14], r2         @ row6

    vext.8          d1, d0, d1, #1              @ Extract pixels (1-8) of row1
    vext.8          d3, d2, d3, #1              @ Extract pixels (1-8) of row5
    vext.8          d5, d4, d5, #1              @ Extract pixels (1-8) of row2
    vext.8          d7, d6, d7, #1              @ Extract pixels (1-8) of row6

    vld1.8          {d8, d9}, [r1], r2          @ load row3
    vld1.8          {d10, d11}, [r14], r2       @ load row7
    vld1.8          {d12, d13}, [r1], r2        @ load row4
    vld1.8          {d14, d15}, [r14], r2       @ load row8

    vext.8          d9, d8, d9, #1              @ Extract pixels (1-8) of row3

    vld1.8          {d16, d17}, [r14], r2       @ load row9

    vext.8          d11, d10, d11, #1           @ Extract pixels (1-8) of row7
    vext.8          d13, d12, d13, #1           @ Extract pixels (1-8) of row4
    vext.8          d15, d14, d15, #1           @ Extract pixels (1-8) of row8
    vext.8          d17, d16, d17, #1           @ Extract pixels (1-8) of row9

    @ interpolation in x direction: widen to 16 bit, no rounding yet
    vaddl.u8        q0, d0, d1                  @ operate row1
    vaddl.u8        q1, d2, d3                  @ operate row5
    vaddl.u8        q2, d4, d5                  @ operate row2
    vaddl.u8        q3, d6, d7                  @ operate row6
    vaddl.u8        q4, d8, d9                  @ operate row3
    vaddl.u8        q5, d10, d11                @ operate row7
    vaddl.u8        q6, d12, d13                @ operate row4
    vaddl.u8        q7, d14, d15                @ operate row8
    vaddl.u8        q8, d16, d17                @ operate row9

    @ interpolation in y direction
    add             r14, r0, r3, lsl #2         @ r14 -> row5 of out

    vadd.u16        q9, q0, q2                  @ operate row1 and row2
    vadd.u16        q13, q1, q3                 @ operate row5 and row6
    vadd.u16        q10, q2, q4                 @ operate row2 and row3
    vadd.u16        q14, q3, q5                 @ operate row6 and row7

    vrshrn.u16      d18, q9, #2                 @ row1: (sum + 2) >> 2, narrowed to 8 bit
    vrshrn.u16      d26, q13, #2                @ row5
    vrshrn.u16      d20, q10, #2                @ row2
    vrshrn.u16      d28, q14, #2                @ row6

    vadd.u16        q11, q4, q6                 @ operate row3 and row4
    vst1.8          {d18}, [r0], r3             @ store row1
    vadd.u16        q15, q5, q7                 @ operate row7 and row8
    vst1.8          {d26}, [r14], r3            @ store row5
    vadd.u16        q12, q6, q1                 @ operate row4 and row5
    vst1.8          {d20}, [r0], r3             @ store row2
    vadd.u16        q7, q7, q8                  @ operate row8 and row9
    vst1.8          {d28}, [r14], r3            @ store row6

    vrshrn.u16      d22, q11, #2                @ row3
    vrshrn.u16      d30, q15, #2                @ row7
    vrshrn.u16      d24, q12, #2                @ row4
    vrshrn.u16      d14, q7, #2                 @ row8

    vst1.8          {d22}, [r0], r3             @ store row3
    vst1.8          {d30}, [r14], r3            @ store row7
    vst1.8          {d24}, [r0], r3             @ store row4
    vst1.8          {d14}, [r14], r3            @ store row8

    vpop            {d8-d15}
    ldmfd           sp!, {pc}





@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : This function pastes the reference block in the
@//                      current frame buffer. This function is called for
@//                      blocks that are not coded and have motion vectors
@//                      with a half pel resolution and ..
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width
@//                      r3 - out_wid : Current Block Width
@//
@// Registers Used     : r12, r14, d0-d3
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Plain copy of 8 rows of 8 bytes from ref to out;
@//                      no averaging is performed.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_fullx_fully_8x8_a9q

impeg2_mc_fullx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}

    add             r14, r1, r2, lsl #2         @ r14 -> row5 of ref
    add             r12, r0, r3, lsl #2         @ r12 -> row5 of out

    vld1.8          {d0}, [r1], r2              @ load row1
    vld1.8          {d1}, [r14], r2             @ load row5
    vld1.8          {d2}, [r1], r2              @ load row2
    vld1.8          {d3}, [r14], r2             @ load row6

    vst1.8          {d0}, [r0], r3              @ store row1
    vst1.8          {d1}, [r12], r3             @ store row5
    vst1.8          {d2}, [r0], r3              @ store row2
    vst1.8          {d3}, [r12], r3             @ store row6

    vld1.8          {d0}, [r1], r2              @ load row3
    vld1.8          {d1}, [r14], r2             @ load row7
    vld1.8          {d2}, [r1], r2              @ load row4
    vld1.8          {d3}, [r14], r2             @ load row8

    vst1.8          {d0}, [r0], r3              @ store row3
    vst1.8          {d1}, [r12], r3             @ store row7
    vst1.8          {d2}, [r0], r3              @ store row4
    vst1.8          {d3}, [r12], r3             @ store row8

    ldmfd           sp!, {r12, pc}





@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_interpolate_a9q()
@//
@// Detail Description : Writes the rounded average (a + b + 1) >> 1 of two
@//                      source macroblocks to the destination: 16x16 bytes
@//                      of y, then 8x8 bytes each of u and v.
@//
@// Inputs             : r0 - pointer to src1; the y, u and v plane pointers
@//                           are read from byte offsets 0, 4 and 8
@//                      r1 - pointer to src2; same layout as src1
@//                      r2 - dest buf; same layout as src1
@//                      r3 - dst stride (row step for y, halved for u/v)
@// Registers Used     : r4, r5, r7, r12, r14, d0-d15
@//                      (d8-d15 are saved and restored)
@//
@// Stack Usage        : 84 bytes (20 for core registers + 64 for d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Both sources are read as contiguous bytes (pointer
@//                      post-increment only); only the destination uses a
@//                      stride.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_interpolate_a9q

impeg2_interpolate_a9q:

    stmfd           r13!, {r4, r5, r7, r12, r14}
    vpush           {d8-d15}                    @ d8-d15 are callee-saved (AAPCS); q4-q7 are used below

    ldr             r4, [r0, #0]                @ ptr_y src1
    ldr             r5, [r1, #0]                @ ptr_y src2
    ldr             r7, [r2, #0]                @ ptr_y dst buf
    mov             r12, #4                     @ counter for number of blocks (4 rows each)

interp_lumablocks_stride:

    vld1.8          {d0, d1}, [r4]!             @ row1 src1
    vld1.8          {d2, d3}, [r4]!             @ row2 src1
    vld1.8          {d4, d5}, [r4]!             @ row3 src1
    vld1.8          {d6, d7}, [r4]!             @ row4 src1

    vld1.8          {d8, d9}, [r5]!             @ row1 src2
    vld1.8          {d10, d11}, [r5]!           @ row2 src2
    vld1.8          {d12, d13}, [r5]!           @ row3 src2
    vld1.8          {d14, d15}, [r5]!           @ row4 src2

    vrhadd.u8       q0, q0, q4                  @ operate on row1
    vrhadd.u8       q1, q1, q5                  @ operate on row2
    vrhadd.u8       q2, q2, q6                  @ operate on row3
    vrhadd.u8       q3, q3, q7                  @ operate on row4

    vst1.8          {d0, d1}, [r7], r3          @ row1
    vst1.8          {d2, d3}, [r7], r3          @ row2
    vst1.8          {d4, d5}, [r7], r3          @ row3
    vst1.8          {d6, d7}, [r7], r3          @ row4

    subs            r12, r12, #1
    bne             interp_lumablocks_stride

    mov             r3, r3, lsr #1              @ stride >> 1

    ldr             r4, [r0, #4]                @ ptr_u src1
    ldr             r5, [r1, #4]                @ ptr_u src2
    ldr             r7, [r2, #4]                @ ptr_u dst buf
    mov             r12, #2                     @ counter for number of blocks (u, then v)

    @ chroma blocks

interp_chromablocks_stride:

    vld1.8          {d0, d1}, [r4]!             @ row1 & 2 src1
    vld1.8          {d2, d3}, [r4]!             @ row3 & 4 src1
    vld1.8          {d4, d5}, [r4]!             @ row5 & 6 src1
    vld1.8          {d6, d7}, [r4]!             @ row7 & 8 src1

    vld1.8          {d8, d9}, [r5]!             @ row1 & 2 src2
    vld1.8          {d10, d11}, [r5]!           @ row3 & 4 src2
    vld1.8          {d12, d13}, [r5]!           @ row5 & 6 src2
    vld1.8          {d14, d15}, [r5]!           @ row7 & 8 src2

    vrhadd.u8       q0, q0, q4                  @ operate on row1 & 2
    vrhadd.u8       q1, q1, q5                  @ operate on row3 & 4
    vrhadd.u8       q2, q2, q6                  @ operate on row5 & 6
    vrhadd.u8       q3, q3, q7                  @ operate on row7 & 8

    vst1.8          {d0}, [r7], r3              @ row1
    vst1.8          {d1}, [r7], r3              @ row2
    vst1.8          {d2}, [r7], r3              @ row3
    vst1.8          {d3}, [r7], r3              @ row4
    vst1.8          {d4}, [r7], r3              @ row5
    vst1.8          {d5}, [r7], r3              @ row6
    vst1.8          {d6}, [r7], r3              @ row7
    vst1.8          {d7}, [r7], r3              @ row8

    @ switch to the v plane for the second pass; the reload after the
    @ second pass is unused because the loop then exits
    ldr             r4, [r0, #8]                @ ptr_v src1
    ldr             r5, [r1, #8]                @ ptr_v src2
    ldr             r7, [r2, #8]                @ ptr_v dst buf

    subs            r12, r12, #1
    bne             interp_chromablocks_stride

    vpop            {d8-d15}
    ldmfd           r13!, {r4, r5, r7, r12, pc}