//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_chroma.s
//*
//* @brief
//*  Contains function definitions for intra chroma prediction.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
//

// Target / syntax: AArch64, GNU assembler, ELF (the :got: relocations below are
// ELF specific).
//
// Neighbour buffer as it is accessed by the code in this file (byte offsets
// from pu1_src, samples are interleaved U,V byte pairs):
//    [ 0..15]  eight U,V pairs; the dc routine treats them as the "left"
//              neighbours. The horz routine feeds output row 0 from the LAST
//              pair (offset 14) and output row 7 from the first pair, i.e. the
//              pairs are consumed in reverse row order.
//    [16..17]  one U,V pair, only read by the plane routine
//              (presumably the top-left neighbour - confirm against the caller).
//    [18..33]  eight U,V pairs; the dc routine treats them as the "top"
//              neighbours.
//
// Every routine writes 8 rows of 16 bytes to pu1_dst, advancing by dst_strd
// after each row. src_strd (x2) is never read.
//
// push_v_regs / pop_v_regs come from ih264_neon_macros.s, which is not visible
// here; they presumably save/restore the callee-saved SIMD registers
// (confirm against the macro definition).

.text
.p2align 2
.include "ih264_neon_macros.s"

.extern ih264_gai1_intrapred_chroma_plane_coeffs1
.extern ih264_gai1_intrapred_chroma_plane_coeffs2



///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_dc
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:DC
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels; only bit 0 (left) and bit 2 (top)
//*  are examined
//*
//* @returns
//*
//* @remarks
//*  Clobbers x1 (advanced by 8 * dst_strd), x6, x7, w15, v0-v6, v20, v21, v23
//*  and the condition flags.
//*
//*******************************************************************************/
//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ui_neighboravailability
//
// Summation trick used by every branch below: after uxtl each 32-bit lane
// holds one U,V pair as two 16-bit values. Two pairwise adds on .4s lanes sum
// the four pairs of a register; a 16-bit half never exceeds 4 * 255, so no
// carry leaks from the U half into the V half. Afterwards h[0] = sum(U) and
// h[1] = sum(V) in every 32-bit lane.
//
// Result registers: v20 is stored to rows 0-3, v21 to rows 4-7.



    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

ih264_intra_pred_chroma_8x8_mode_dc_av8:

    push_v_regs

    // 0b101 is not encodable as an A64 logical immediate, so the mask goes
    // through a register. A caller-saved scratch register is used so that no
    // callee-saved general purpose register has to be spilled.
    mov       x7, #5
    ands      x6, x4, x7
    beq       .Lchroma_dc_none_available
    cmp       x6, #1
    beq       .Lchroma_dc_left_only_available
    cmp       x6, #4
    beq       .Lchroma_dc_top_only_available

    // Fall through: both examined bits are set.
    // v0 = src[0..7], v1 = src[8..15], v2 = src[18..25], v3 = src[26..33]
    ld1       {v0.8b, v1.8b}, [x0]
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    rshrn     v5.8b, v0.8h, #2          // (sum(src[0..7]) + 2) >> 2
    dup       v21.8h, v5.h[0]           // rows 4-7, both halves for now
    rshrn     v6.8b, v3.8h, #2          // (sum(src[26..33]) + 2) >> 2
    dup       v20.8h, v6.h[0]           // rows 0-3, both halves for now
    add       v1.8h, v1.8h, v2.8h       // src[8..15] + src[18..25]
    rshrn     v1.8b, v1.8h, #3          // (sum + 4) >> 3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]        // rows 0-3, first 8 bytes
    add       v0.8h, v0.8h, v3.8h       // src[0..7] + src[26..33]
    rshrn     v0.8b, v0.8h, #3          // (sum + 4) >> 3
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]        // rows 4-7, last 8 bytes
    b         .Lchroma_dc_store

.Lchroma_dc_left_only_available:
    // v0 = src[0..7], v1 = src[8..15]
    ld1       {v0.8b, v1.8b}, [x0]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h, v1.h[0]           // rows 0-3 <- average of src[8..15]
    dup       v21.8h, v0.h[0]           // rows 4-7 <- average of src[0..7]
    b         .Lchroma_dc_store

.Lchroma_dc_top_only_available:
    // v0 = src[18..25], v1 = src[26..33]
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h, v0.h[0]
    dup       v21.8h, v1.h[0]
    // Both row groups end up as: first 8 bytes from v0, last 8 bytes from v1.
    mov       v20.d[1], v21.d[1]
    mov       v21.d[0], v20.d[0]
    b         .Lchroma_dc_store

.Lchroma_dc_none_available:
    // Neither examined bit is set: fill the block with the constant 128.
    mov       w15, #128
    dup       v20.16b, w15
    dup       v21.16b, w15

.Lchroma_dc_store:
    st1       {v20.16b}, [x1], x3
    st1       {v20.16b}, [x1], x3
    st1       {v20.16b}, [x1], x3
    st1       {v20.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3

    pop_v_regs
    ret




///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_horz
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:Horizontal
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  Clobbers x1 (advanced by 8 * dst_strd), v0 and v10-v17.
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

ih264_intra_pred_chroma_8x8_mode_horz_av8:

    push_v_regs
    ld1       {v0.8h}, [x0]             // eight U,V pairs, one per 16-bit lane

    // Output row r is the pair in lane (7 - r) replicated eight times.
    // The dup/st1 interleaving is kept exactly as originally scheduled.
    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x1], x3
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x1], x3
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x1], x3
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x1], x3
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x1], x3
    st1       {v15.8h}, [x1], x3
    st1       {v16.8h}, [x1], x3
    st1       {v17.8h}, [x1], x3

    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_vert
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:vertical
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  Clobbers x0 (advanced by 18), x1 (advanced by 8 * dst_strd), v0 and v1.
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    push_v_regs

    add       x0, x0, #18               // src[18..33]: the 16 bytes copied to every row
    ld1       {v0.8b, v1.8b}, [x0]

    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3

    pop_v_regs
    ret




///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_plane
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:PLANE
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not read by this function)
//*
//* @returns
//*
//* @remarks
//*  Clobbers x0 (advanced by 2), x1 (advanced by 8 * dst_strd), x6-x10, x12,
//*  v0-v10, v12-v18, v20, v22, v24, v26, v28-v30.
//*
//*******************************************************************************/
//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                            WORD32 dst_strd,
//                                            WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ui_neighboravailability
//
// Outline of the computation, per colour component (U and V are processed
// side by side in alternating 16-bit lanes):
//    grad1 = sum over i of coeffs1[i] * (src pair (4 + i) - src pair (2 - i))   within src[0..17]
//    grad2 = sum over i of coeffs1[i] * (src pair (4 + i) - src pair (2 - i))   within src[16..33]
//    c     = (34 * grad1 + 32) >> 6        multiplied by coeffs2[row]
//    b     = (34 * grad2 + 32) >> 6        multiplied by coeffs2[column]
//    a     = 16 * (src[0 or 1] + src[32 or 33])
//    pred(row, col) = saturate_u8((a + b * coeffs2[col] + c * coeffs2[row] + 16) >> 5)
// The two coefficient tables are defined elsewhere; coeffs1 is presumably the
// weights 1..4 (one per U/V lane) and coeffs2 presumably the offsets -3..4 -
// confirm against their definitions.

    .global ih264_intra_pred_chroma_8x8_mode_plane_av8
ih264_intra_pred_chroma_8x8_mode_plane_av8:

    push_v_regs

    // ---- gather the neighbour samples needed for the two gradients --------
    ld1       {v0.2s}, [x0]             // src[0..7]
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]            // src[10..17]
    add       x10, x10, #6              // x10 = src + 16
    rev64     v5.4h, v0.4h              // src[0..7] with the four U,V pairs reversed
    ld1       {v2.2s}, [x10], #8        // src[16..23]; x10 = src + 24
    add       x10, x10, #2              // x10 = src + 26
    rev64     v7.4h, v2.4h              // src[16..23] with the four U,V pairs reversed
    ld1       {v3.2s}, [x10]            // src[26..33]
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
    usubl     v10.8h, v5.8b, v1.8b      // differences inside src[0..17]
    ld1       {v8.8b, v9.8b}, [x12]     // 16 bytes of coeffs1, used as eight 16-bit weights
    mov       v8.d[1], v9.d[0]
    usubl     v12.8h, v3.8b, v7.8b      // differences inside src[16..33]
    mul       v14.8h, v10.8h, v8.8h
    mul       v16.8h, v12.8h, v8.8h

    // ---- de-interleave U and V, then reduce each group of four to one sum --
    uzp1      v15.8h, v14.8h, v16.8h    // even lanes (U): grad1 terms | grad2 terms
    uzp2      v16.8h, v14.8h, v16.8h    // odd lanes  (V): grad1 terms | grad2 terms
    mov       v14.16b, v15.16b          // v14 low = U grad1 terms
    mov       v15.d[0], v14.d[1]        // v15 low = U grad2 terms
    mov       v17.d[0], v16.d[1]        // v17 low = V grad2 terms (v16 low = V grad1 terms)
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h

    // ---- scale the gradients: (34 * grad + 32) >> 6 -----------------------
    mov       x6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6        // c for U
    rshrn     v12.4h, v24.4s, #6        // b for U
    rshrn     v13.4h, v26.4s, #6        // c for V
    rshrn     v14.4h, v28.4s, #6        // b for V

    // ---- a = 16 * (src[0|1] + src[32|33]) ---------------------------------
    // ldrb zero-extends into the full x register, so no explicit extension is
    // needed before the 64-bit adds.
    ldrb      w6, [x0], #1              // src[0]; x0 = src + 1
    add       x10, x0, #31              // x10 = src + 32
    ldrb      w8, [x0], #1              // src[1]
    ldrb      w7, [x10], #1             // src[32]
    ldrb      w9, [x10], #1             // src[33]
    add       x6, x6, x7
    add       x8, x8, x9
    lsl       x6, x6, #4
    lsl       x8, x8, #4

    // ---- broadcast a, b, c and interleave them as U,V,U,V,... -------------
    dup       v0.8h, w6                 // a (U)
    dup       v2.8h, w8                 // a (V)
    dup       v4.8h, v12.h[0]           // b (U)
    dup       v6.8h, v10.h[0]           // c (U)
    dup       v24.8h, v14.h[0]          // b (V)
    dup       v26.8h, v13.h[0]          // c (V)
    zip1      v5.8h, v4.8h, v24.8h
    zip2      v24.8h, v4.8h, v24.8h
    mov       v4.16b, v5.16b            // v4 = b, interleaved U,V
    zip1      v7.8h, v6.8h, v26.8h
    zip2      v26.8h, v6.8h, v26.8h
    mov       v6.16b, v7.16b            // v6 = c, interleaved U,V
    zip1      v1.8h, v0.8h, v2.8h
    zip2      v2.8h, v0.8h, v2.8h
    mov       v0.16b, v1.16b            // v0 = a, interleaved U,V

    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]

    // ---- column part: a + b * coeffs2[col], identical for every row -------
    ld1       {v8.2s, v9.2s}, [x12]     // 16 bytes of coeffs2, used as eight 16-bit values
    mov       v8.d[1], v9.d[0]
    mov       v10.16b, v8.16b
    mov       v22.16b, v8.16b           // v22 keeps coeffs2 for the per-row lookups
    zip1      v9.8h, v8.8h, v10.8h      // coeffs2[0..3], each duplicated for U and V
    zip2      v10.8h, v8.8h, v10.8h     // coeffs2[4..7], each duplicated for U and V
    mov       v8.16b, v9.16b
    mul       v12.8h, v4.8h, v8.8h
    mul       v16.8h, v4.8h, v10.8h
    add       v12.8h, v0.8h, v12.8h     // columns 0-3
    add       v16.8h, v0.8h, v16.8h     // columns 4-7

    // ---- row part: add c * coeffs2[row], round, saturate, store -----------
    // v20 / v30 alternately hold coeffs2[row] for even / odd rows. The
    // instruction order below is the original software-pipelined schedule.
    dup       v20.8h, v22.h[0]
    mul       v4.8h, v6.8h, v20.8h
    dup       v30.8h, v22.h[1]
    mul       v18.8h, v6.8h, v20.8h
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    add       v2.8h, v12.8h, v14.8h
    sqrshrun  v28.8b, v24.8h, #5
    add       v26.8h, v16.8h, v8.8h
    sqrshrun  v29.8b, v0.8h, #5
    dup       v20.8h, v22.h[2]
    st1       {v28.8b, v29.8b}, [x1], x3    // row 0
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 1
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[3]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 2
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    dup       v20.8h, v22.h[4]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 3
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[5]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 4
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    dup       v20.8h, v22.h[6]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 5
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[7]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 6
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3    // row 7

    pop_v_regs
    ret