///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_planar.s
//*
//* @brief
//*  contains function definitions for luma intra prediction (planar mode).
//*  functions are hand-written AArch64 (A64) Advanced SIMD assembly in GNU
//*  assembler syntax
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_planar_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intraprediction filter for planar input
//*
//* @par description:
//*    for every (row, col) of the nt x nt block the code below evaluates
//*
//*      dst[row][col] = ( (row + 1)      * src[nt - 1]
//*                      + (col + 1)      * src[3nt + 1]
//*                      + (nt - 1 - row) * src[2nt + 1 + col]
//*                      + (nt - 1 - col) * src[2nt - 1 - row]
//*                      + nt ) >> (log2(nt) + 1)
//*
//*    nt == 4 is handled by a dedicated row loop (tf_sz_4); every other size
//*    goes through a software-pipelined loop that produces 8x8 tiles.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients (not read by this implementation)
//*
//* @param[in] nt
//*  size of tranform block
//*
//* @param[in] mode
//*  type of filtering (not read by this implementation)
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
//                                  word32 src_strd,
//                                  uword8* pu1_dst,
//                                  word32 dst_strd,
//                                  word32 nt,
//                                  word32 mode,
//                                  word32 pi1_coeff)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//w1 => src_strd   (x1 is reused as a countdown counter)
//x2 => *pu1_dst
//w3 => dst_strd   (sign-extended into x3 on entry)
//w4 => nt         (sign-extended into x4 on entry)
//x5, x6 are overwritten before being read, so any further arguments are ignored.
//No argument is fetched from the stack.
//
//vector register roles
//v0  = src[nt-1] in every lane        v1  = src[3nt+1] in every lane
//v2  = nt in every lane               v3  = src[2nt+1+col] for 8 columns
//v4  = 8 bytes of src[2nt-1-row]      v5  = (row+1) per lane
//v6  = (nt-1-row) per lane            v7  = 1 per lane (row step)
//v17 = (col+1) coefficients           v19 = (nt-1-col)
//v29 = -(log2(nt)+1) per halfword, used with sshl as a right shift

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_planar_av8
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

.type ihevc_intra_pred_luma_planar_av8, %function

ihevc_intra_pred_luma_planar_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

    //dst_strd and nt are 32-bit arguments; the upper halves of x3/x4 are not
    //guaranteed by the procedure call standard, and both registers are used
    //as 64-bit operands below, so widen them explicitly.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]

    clz         w5,w4
    sub         x20, x5, #32
    neg         x5, x20                     //32 - clz(nt) = log2(nt) + 1
    dup         v29.8h,w5
    neg         v29.8h, v29.8h              //shr value (so vneg)
    dup         v2.8b,w4                    //nt
    dup         v16.8h,w4                   //nt

    sub         x6, x4, #1                  //nt-1
    add         x6, x6, x0
    ldr         w7, [x6]                    //only the low byte is used by dup
    sxtw        x7,w7
    dup         v0.8b,w7                    //src[nt-1]

    add         x6, x4, x4,lsl #1           //3nt
    add         x6, x6, #1                  //3nt + 1
    add         x6, x6, x0
    ldr         w7, [x6]                    //only the low byte is used by dup
    sxtw        x7,w7
    dup         v1.8b,w7                    //src[3nt+1]

    add         x6, x4, x4                  //2nt
    add         x14, x6, #1                 //2nt+1
    sub         x6, x6, #1                  //2nt-1
    add         x6, x6, x0                  //&src[2nt-1]
    add         x14, x14, x0                //&src[2nt+1]

    mov         x8, #1                      //row+1 (row is first 0)
    sub         x9, x4, x8                  //nt-1-row (row is first 0)

    dup         v5.8b,w8                    //row + 1
    dup         v6.8b,w9                    //nt - 1 - row
    mov         v7.8b, v5.8b                //#1 per lane: inc for row+1 and dec for nt-1-row

    add         x12, x11, #1                //coeffs (to be reloaded after every row)
                                            //NOTE(review): table contents are not visible
                                            //here; the loads from x12 are used as (col+1)
    mov         x1, x4                      //nt (countdown counter)
    mov         x5, x2                      //dst
    mov         x10, #8                     //increment for the coeffs
    mov         x0, x14                     //&src[2nt+1] (reload value for x14)

    cmp         x4, #4
    beq         tf_sz_4

//@ ========== ***************** =====================
//8x8-tile path. x7 counts the remaining work in units of 8 (nt * nt/8 in
//total), x1 counts down by 8 per tile and decides whether the next tile is
//to the right (gt) or at the start of the next band of 8 rows (le).
prolog:
tf_sz_8_16_32:

    mov         x7, x4                      //column counter (set to no of cols)
    lsr         x9, x4, #3                  //divide nt by 8
    mul         x7, x7, x9                  //multiply width * height
    adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
    ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
                                            //NOTE(review): table contents are not visible
                                            //here; the loads from x5 are used as (row+1)
    sub         x6, x6, #7                  //&src[2nt-8]: lane 7 of the load is src[2nt-1]
    mov         x8, x2
    lsl         x9, x3, #3                  //8*stride
    sub         x20, x9, #8                 //8*stride - 8
    neg         x9, x20                     //8 - 8*stride
    mov         x10, x4                     //nt
    sub         x10, x10, #8                //nt - 8

col_loop_8_16_32:

    ld1         {v17.8b},[x12]              //(1-8)load 8 coeffs [col+1]
    dup         v27.8h,w4                   //(1) accumulator starts at nt (rounding term)
    ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
    sub         v19.8b, v2.8b , v17.8b      //(1-8)[nt-1-col]


    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *   src[nt-1]

    ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *   src[3nt+1]

    dup         v20.8b, v4.b[7]             //(1)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row) *   src[2nt+1+col]

    dup         v21.8b, v4.b[6]             //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col) *   src[2nt-1-row]

    dup         v30.8h,w4                   //(2)
    add         v5.8b, v5.8b , v7.8b        //(1)

    sub         v6.8b, v6.8b , v7.8b        //(1)

    dup         v22.8b, v4.b[5]             //(3)
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    umlal       v30.8h, v6.8b, v3.8b        //(2)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b, v5.8b , v7.8b        //(2)
    sub         v6.8b, v6.8b , v7.8b        //(2)

    xtn         v27.8b, v27.8h              //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.b[4]             //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b, v5.8b , v7.8b        //(3)
    sub         v6.8b, v6.8b , v7.8b        //(3)

    xtn         v30.8b, v30.8h              //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.b[3]             //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b, v5.8b , v7.8b        //(4)
    sub         v6.8b, v6.8b , v7.8b        //(4)

    xtn         v28.8b, v28.8h              //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.b[2]             //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    sshl        v25.8h, v25.8h, v29.8h      //(4)shr
    add         v5.8b, v5.8b , v7.8b        //(5)
    sub         v6.8b, v6.8b , v7.8b        //(5)

    xtn         v25.8b, v25.8h              //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.b[1]             //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    add         v5.8b, v5.8b , v7.8b        //(6)
    sub         v6.8b, v6.8b , v7.8b        //(6)

    xtn         v16.8b, v16.8h              //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.b[0]             //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b, v5.8b , v7.8b        //(7)
    sub         v6.8b, v6.8b , v7.8b        //(7)

    xtn         v18.8b, v18.8h              //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)


    umlal       v24.8h, v17.8b, v1.8b       //(8)

    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    subs        x7, x7, #8

    beq         epilog                      //single 8x8 tile: only rows 7 and 8 remain

    //set up pointers and operands for the next tile; the csel chain keys off
    //the flags of the subs on x1 (neon instructions in between keep the flags)
    subs        x1, x1, #8                  //row counter
    add         x20, x12, #8                //col inc
    csel        x12, x20, x12,gt
    add         x20, x14, #8                //also for col inc
    csel        x14, x20, x14,gt
    csel        x1, x4, x1,le               //nt reloaded (refresh the value)
    add         x20, x11, #1                //x12 reset
    csel        x12, x20, x12,le

    csel        x14, x0, x14,le             //x14 reset
    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]

    sub         x20, x6, #8                 //for next set of rows
    csel        x6, x20, x6,le
    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]

    add         x20, x5, #8
    csel        x5, x20, x5,le
    dup         v27.8h,w4                   //(1n)(1)

    ld1         {v5.8b},[x5]                //(row+1) values for the next tile

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    sub         v19.8b, v2.8b , v17.8b      //(1n)(1-8)[nt-1-col]

    dup         v20.8b, v4.b[7]             //(1n)(1)
    sub         v6.8b, v2.8b , v5.8b        //(nt-1-row) = nt - (row+1)

    beq         epilog

//steady state: finishes rows (7),(8) of the previous tile while computing
//rows (1)..(6) of the current one and preloading the next tile's operands
kernel_plnr:

    cmp         x1, #0                      // (cond loop)
    sshl        v24.8h, v24.8h, v29.8h      //(8)shr

    xtn         v26.8b, v26.8h              //(7)
    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *   src[nt-1]

    xtn         v24.8b, v24.8h              //(8)
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *   src[3nt+1]

    dup         v21.8b, v4.b[6]             //(2)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row) *   src[2nt+1+col]

    dup         v30.8h,w4                   //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col) *   src[2nt-1-row]

    st1         {v26.8b},[x2], x3           //(7)str 8 values
    add         v5.8b, v5.8b , v7.8b        //(1)

    st1         {v24.8b},[x2], x3           //(8)str 8 values
    sub         v6.8b, v6.8b , v7.8b        //(1)

    add         x20, x2, x9                 //since more cols to fill, dst + 8 - 8*strd (cond loop)
    csel        x2, x20, x2,gt
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
    csel        x2, x20, x2,le
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    dup         v22.8b, v4.b[5]             //(3)
    umlal       v30.8h, v6.8b, v3.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b, v5.8b , v7.8b        //(2)
    csel        x1, x4, x1,le               //nt reloaded (refresh the value) (cond loop)

    sub         v6.8b, v6.8b , v7.8b        //(2)
    subs        x1, x1, #8                  //row counter (loop); flags feed the csels below

    xtn         v27.8b, v27.8h              //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.b[4]             //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b, v5.8b , v7.8b        //(3)

    sub         v6.8b, v6.8b , v7.8b        //(3)

    xtn         v30.8b, v30.8h              //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.b[3]             //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b, v5.8b , v7.8b        //(4)

    sub         v6.8b, v6.8b , v7.8b        //(4)

    xtn         v28.8b, v28.8h              //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.b[2]             //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    add         x20, x11, #1                //x12 reset (cond loop)
    csel        x12, x20, x12,le
    sshl        v25.8h, v25.8h, v29.8h      //(4)shr

    add         x20, x12, #8                //col inc (cond loop)
    csel        x12, x20, x12,gt
    add         v5.8b, v5.8b , v7.8b        //(5)

    add         x20, x14, #8                //also for col inc (cond loop)
    csel        x14, x20, x14,gt
    sub         v6.8b, v6.8b , v7.8b        //(5)

    xtn         v25.8b, v25.8h              //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.b[1]             //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    csel        x14, x0, x14,le             //x14 reset (cond loop)
    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    sub         x20, x6, #8                 //for next set of rows (cond loop)
    csel        x6, x20, x6,le
    add         v5.8b, v5.8b , v7.8b        //(6)

    add         x20, x5, #8                 // (cond loop)
    csel        x5, x20, x5,le
    sub         v6.8b, v6.8b , v7.8b        //(6)

    xtn         v16.8b, v16.8h              //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.b[0]             //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b, v5.8b , v7.8b        //(7)

    sub         v6.8b, v6.8b , v7.8b        //(7)

    xtn         v18.8b, v18.8h              //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)

    ld1         {v5.8b},[x5]                //(row+1 value)
    umlal       v24.8h, v17.8b, v1.8b       //(8)

    dup         v20.8b, v4.b[7]             //(1n)(1)
    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
    sub         v6.8b, v2.8b , v5.8b        //(nt-1-row) value

    subs        x7, x7, #8                  //col counter

    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    dup         v27.8h,w4                   //(1n)(1)
    sub         v19.8b, v2.8b , v17.8b      //(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

//drain the pipeline: rows (7) and (8) of the last tile
epilog:

    xtn         v26.8b, v26.8h              //(7)
    st1         {v26.8b},[x2], x3           //(7)str 8 values

    sshl        v24.8h, v24.8h, v29.8h      //(8)shr
    xtn         v24.8b, v24.8h              //(8)
    st1         {v24.8b},[x2], x3           //(8)str 8 values

//@ ========== ***************** =====================

    //every path into epilog arrives with Z set and nothing above changes the
    //flags, so this branch always skips the 4x4 code
    beq         end_loop

//4x4 path: one output row (4 bytes) per iteration, x1 counts rows
tf_sz_4:
    ld1         {v25.8b},[x14]              //load src[2nt+1+col]
    ld1         {v17.8b},[x12], x10         //load 8 coeffs [col+1]
loop_sz_4:
    mov         x10, #4                     //reduce inc to #4 for 4x4
    ldr         w7, [x6], #-1               //src[2nt-1-row] (dec to take into account row)
    sxtw        x7,w7
    dup         v4.8b,w7                    //src[2nt-1-row]

    sub         v19.8b, v2.8b , v17.8b      //[nt-1-col]

    umull       v27.8h, v5.8b, v0.8b        //(row+1)   *   src[nt-1]
    umlal       v27.8h, v6.8b, v25.8b       //(nt-1-row)    *   src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(col+1)   *   src[3nt+1]
    umlal       v27.8h, v19.8b, v4.8b       //(nt-1-col)    *   src[2nt-1-row]
    //rounding narrow by 3 = (sum + 4) >> 3, i.e. add nt, shift by log2(nt)+1
    //and narrow in a single instruction for nt == 4
    rshrn       v27.8b, v27.8h,#3
    st1         {v27.s}[0],[x2], x3         //store the 4 valid output bytes

    add         v5.8b, v5.8b , v7.8b        //row++ [(row+1)++]
    sub         v6.8b, v6.8b , v7.8b        //[nt-1-row]--
    subs        x1, x1, #1

    bne         loop_sz_4

end_loop:
    ldp         x19, x20,[sp],#16           //restore callee-saved registers

    ret