@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@*
@ *******************************************************************************
@ * @file
@ *  ih264_padding_neon.s
@ *
@ * @brief
@ *  Contains function definitions for padding
@ *
@ * @author
@ *  Ittiam
@ *
@ * @par List of Functions:
@ *  - ih264_pad_top_a9q()
@ *  - ih264_pad_left_luma_a9q()
@ *  - ih264_pad_left_chroma_a9q()
@ *  - ih264_pad_right_luma_a9q()
@ *  - ih264_pad_right_chroma_a9q()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*


@**
@*******************************************************************************
@*
@* @brief pad at the top of a 2d array
@*
@* @par Description:
@*  The top row of a 2d array is replicated pad_size times above it.
@*  The row is processed in 16-byte columns: each column is loaded once and
@*  stored into every pad row before moving on to the next column.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (first byte of the top row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] wd
@*  integer width of the array; consumed 16 bytes at a time, so a width that
@*  is not a multiple of 16 is effectively rounded up to the next multiple
@*
@* @param[in] pad_size
@*  integer number of rows to write above the top row
@*
@* @returns none
@*
@* @remarks
@*  Returns without writing anything when pad_size <= 0 or wd <= 0.
@*
@*******************************************************************************
@*
@void ih264_pad_top(UWORD8 *pu1_src,
@                   WORD32 src_strd,
@                   WORD32 wd,
@                   WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => wd
@   r3 => pad_size

.text
.p2align 2

    .global ih264_pad_top_a9q

ih264_pad_top_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and lr

    cmp           r3, #0                @ nothing to do without pad rows ...
    ble           end_func_pad_top
    cmp           r2, #0                @ ... or without any columns
    ble           end_func_pad_top

    sub           r5, r0, r1            @ r5 = row directly above the source row
    rsb           r6, r1, #0            @ r6 = -src_strd (store pointer walks upwards)

loop_neon_memcpy_mul_16:
    vld1.8        {d0, d1}, [r0]!       @ load 16 source bytes, advance source
    mov           r4, r5                @ r4 = store pointer for this column
    mov           r7, r3                @ r7 = pad rows left for this column
    add           r5, r5, #16           @ next column starts 16 bytes further right

loop_neon_pad_top:
    vst1.8        {d0, d1}, [r4], r6    @ write 16 bytes, step one row up
    subs          r7, r7, #1
    bgt           loop_neon_pad_top

    subs          r2, r2, #16
    bgt           loop_neon_memcpy_mul_16   @ bgt (not bne) so a remainder cannot run past zero

end_func_pad_top:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return




@**
@*******************************************************************************
@*
@* @brief
@*  Padding (luma block) at the left of a 2d array
@*
@* @par Description:
@*  The left column of a 2d array is replicated for pad_size times at the left
@*
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] pad_size
@*  integer -padding size of the array
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_left_luma(UWORD8 *pu1_src,
@                         WORD32 src_strd,
@                         WORD32 ht,
@                         WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size
@
@ ih264_pad_left_luma_a9q: for every row, the byte at pu1_src is replicated
@ into the bytes starting at (pu1_src - pad_size).
@   - pad_size == 16 : one 16-byte store per row
@   - otherwise      : two 16-byte stores per row (32 bytes)
@ Rows are handled eight at a time and the loops close on "ht reaches exactly
@ zero", so ht must be a positive multiple of 8.

    .global ih264_pad_left_luma_a9q

ih264_pad_left_luma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and lr

    sub           r4, r0, r3            @ r4 = store pointer = pu1_src - pad_size
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after a 16-byte writeback)
    subs          r5, r3, #16           @ only the flags are used: pad_size == 16 ?
    bne           loop_32
loop_16:        @  /*hard coded for width=16  ,height =8,16*/
    ldrb          r8, [r0], r1          @ edge byte of row 0, step to next row
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8                @ replicate the byte across 16 lanes
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    ldrb          r11, [r0], r1
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    subs          r2, r2, #8            @ eight rows done
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    bne           loop_16
    b             end_func

loop_32:        @  /*hard coded for width=32 ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ first 16 bytes of the row
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ second 16 bytes, then on to the next row
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vdup.u8       q0, r8
    ldrb          r9, [r0], r1
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #8            @ eight rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    bne           loop_32



end_func:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (chroma block) at the left of a 2d array
@*
@* @par Description:
@*  For every row, the 16-bit unit at pu1_src is replicated into the 32 bytes
@*  starting at (pu1_src - pad_size).
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; rows are handled four at a time and the loop
@*  closes on "ht reaches exactly zero", so ht must be a positive multiple of 4
@*
@* @param[in] pad_size
@*  integer padding size; only used to locate the first byte to write, the
@*  number of bytes written per row is fixed at 32
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_left_chroma(UWORD8 *pu1_src,
@                           WORD32 src_strd,
@                           WORD32 ht,
@                           WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_left_chroma_a9q

ih264_pad_left_chroma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and lr

    sub           r4, r0, r3            @ r4 = store pointer = pu1_src - pad_size
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after a 16-byte writeback)


loop_32_l_c:    @  /*hard coded for width=32  ,height =4,8,12*/
    ldrh          r8, [r0], r1          @ 16-bit edge unit of row 0, step to next row
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8                @ replicate the halfword across 8 lanes
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store


    beq           end_func_l_c          @/* Branching when ht=4*/

    ldrh          r8, [r0], r1
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four more rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    bne           loop_32_l_c           @ rows left: go round again; ht=8 falls through

end_func_l_c:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (luma block) at the right of a 2d array
@*
@* @par Description:
@*  For every row, the byte just before pu1_src (the last byte of the row) is
@*  replicated into the bytes starting at pu1_src.
@*   - pad_size == 16 : one 16-byte store per row
@*   - otherwise      : two 16-byte stores per row (32 bytes)
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the first byte to be written
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; rows are handled eight at a time and the
@*  loops close on "ht reaches exactly zero", so ht must be a positive
@*  multiple of 8
@*
@* @param[in] pad_size
@*  integer padding size; selects the 16-byte or the 32-byte path
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_right_luma(UWORD8 *pu1_src,
@                        WORD32 src_strd,
@                        WORD32 ht,
@                        WORD32 pad_size)
@{
@    WORD32 row;
@
@    for(row = 0; row < ht; row++)
@    {
@        memset(pu1_src, *(pu1_src -1), pad_size);
@
@        pu1_src += src_strd;
@    }
@}
@
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_right_luma_a9q

ih264_pad_right_luma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and lr

    mov           r4, r0                @ r4 = store pointer = pu1_src
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after a 16-byte writeback)
    sub           r0, r0, #1            @ r0 = load pointer = last byte of the row
    subs          r5, r3, #16           @ only the flags are used: pad_size == 16 ?
    bne           loop_32_r             @ stay in this routine (was: loop_32 of the left-luma routine)
loop_16_r:      @  /*hard coded for width=16  ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    ldrb          r11, [r0], r1
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    subs          r2, r2, #8            @ eight rows done
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    bne           loop_16_r
    b             end_func_r

loop_32_r:      @  /*hard coded for width=32  ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ first 16 bytes of the row
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ second 16 bytes, then on to the next row
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #8            @ eight rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    bne           loop_32_r



end_func_r:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (chroma block) at the right of a 2d array
@*
@* @par Description:
@*  For every row, the 16-bit unit just before pu1_src (the last two bytes of
@*  the row) is replicated into the 32 bytes starting at pu1_src.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the first byte to be written
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; rows are handled four at a time and the loop
@*  closes on "ht reaches exactly zero", so ht must be a positive multiple of 4
@*
@* @param[in] pad_size
@*  integer padding size; not read by this routine, the number of bytes
@*  written per row is fixed at 32
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_right_chroma(UWORD8 *pu1_src,
@                        WORD32 src_strd,
@                        WORD32 ht,
@                        WORD32 pad_size)
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_right_chroma_a9q

ih264_pad_right_chroma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and lr

    mov           r4, r0                @ r4 = store pointer = pu1_src
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after a 16-byte writeback)
    sub           r0, r0, #2            @ r0 = load pointer = last 16-bit unit of the row
loop_32_r_c:    @  /*hard coded for width=32 ,height =8,4*/
    ldrh          r8, [r0], r1
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8                @ replicate the halfword across 8 lanes
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four rows done (flags survive until the beq below)
    ldrh          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    beq           end_func_r_c          @/* Branching when ht=4*/

    ldrh          r8, [r0], r1
    vdup.u16      q0, r8
    ldrh          r9, [r0], r1
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four more rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    bne           loop_32_r_c           @ rows left: go round again; ht=8 falls through

end_func_r_c:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return




