1 @/***************************************************************************** 2 @* 3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 @* 5 @* Licensed under the Apache License, Version 2.0 (the "License"); 6 @* you may not use this file except in compliance with the License. 7 @* You may obtain a copy of the License at: 8 @* 9 @* http://www.apache.org/licenses/LICENSE-2.0 10 @* 11 @* Unless required by applicable law or agreed to in writing, software 12 @* distributed under the License is distributed on an "AS IS" BASIS, 13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 @* See the License for the specific language governing permissions and 15 @* limitations under the License. 16 @* 17 @*****************************************************************************/ 18 @/** 19 @****************************************************************************** 20 @* @file 21 @* ihevc_inter_pred_luma_horz.s 22 @* 23 @* @brief 24 @* contains function definitions for inter prediction interpolation. 
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma filter for horizontal input
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed by 'pu1_src' and writes to the location pointed by
@*    'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
@*    assumptions : the function is optimized considering the fact width is
@*    multiple of 4 or 8. and height as multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd )

@**************variables vs registers*****************************************
@   r0  => *pu1_src             (arg)
@   r1  => *pu1_dst             (arg; walks forward during a pass)
@   r2  => src_strd             (arg)
@   r3  => dst_strd             (arg)
@   r4  => *pi1_coeff           (stack arg; reused as the 2nd-row src pointer)
@   r5  => wd countdown inside a row pair
@   r6  => 2nd-row dst pointer  (pu1_dst + dst_strd)
@   r7  => saved copy of pu1_dst (base for the residual passes)
@   r8  => 2*dst_strd - wd      (dst advance after each row pair)
@   r9  => 2*src_strd - wd      (src advance after each row pair)
@   r10 => wd
@   r11 => #1: post-increment step, so consecutive vld1 loads yield
@          1-byte-shifted windows of the same row (replaces vext)
@   r12 => src pointer, starts at pu1_src - 3 (8-tap filter looks back 3)
@   r14 => ht countdown (2 rows per iteration)

.text
.align 4




.globl ihevc_inter_pred_luma_horz_a9q

.type ihevc_inter_pred_luma_horz_a9q, %function

@ 8-tap horizontal luma interpolation filter, two rows per pass.
@ After the stmfd below (10 registers = 40 bytes) the stack arguments sit at
@ sp+40 (pi1_coeff), sp+44 (ht), sp+48 (wd).
@ The multiplies use |coeff| (vabs.s8) and the coefficient signs are re-applied
@ by the fixed vmlal/vmlsl pattern: -c0 +c1 -c2 +c3 +c4 -c5 +c6 -c7.
ihevc_inter_pred_luma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}      @ save callee-saved regs + lr
@    str r1,[sp,#-4]
@    mov r7,#8192
start_loop_count:
@    ldr r1,[sp,#-4]


    ldr         r4, [sp, #40]           @ loads pi1_coeff
    ldr         r8, [sp, #44]           @ loads ht
    ldr         r10, [sp, #48]          @ loads wd

    vld1.8      {d0}, [r4]              @ coeff = vld1_s8(pi1_coeff)
    mov         r11, #1                 @ 1-byte post-increment for the sliding loads
    subs        r14, r8, #0             @ r14 = ht; checks for ht == 0

    vabs.s8     d2, d0                  @ coeffabs = vabs_s8(coeff)

@    ble end_loops


    vdup.8      d24, d2[0]              @ coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12, r0, #3             @ pu1_src - 3
    vdup.8      d25, d2[1]              @ coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4, r12, r2             @ pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26, d2[2]              @ coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    rsb         r9, r10, r2, lsl #1     @ 2*src_strd - wd
    vdup.8      d27, d2[3]              @ coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    rsb         r8, r10, r3, lsl #1     @ 2*dst_strd - wd
    vdup.8      d28, d2[4]              @ coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    vdup.8      d29, d2[5]              @ coeffabs_5 = vdup_lane_u8(coeffabs, 5)
@    tst r10,#7                         @ checks wd for multiples
    vdup.8      d30, d2[6]              @ coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    vdup.8      d31, d2[7]              @ coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    mov         r7, r1                  @ keep the original pu1_dst for the residual passes

    @ dispatch on width: 4 / 8 / 12(=8+4) / >=16 / 24(=16+8)
    cmp         r10, #4
    ble         outer_loop_4

    cmp         r10, #24                @ wd == 24: 16-wide pass, then 8-wide residual
    moveq       r10, #16
    addeq       r8, #8
    addeq       r9, #8

    cmp         r10, #16
    bge         outer_loop_16

    cmp         r10, #12                @ wd == 12: 8-wide pass, then 4-wide residual
    addeq       r8, #4
    addeq       r9, #4
    b           outer_loop_8


@ entered from the 16-wide epilog when wd was 24: produce the rightmost 8 columns
outer_loop8_residual:
    sub         r12, r0, #3             @ pu1_src - 3
    mov         r1, r7                  @ rewind dst to the saved pu1_dst
    mov         r14, #32                @ NOTE(review): residual height hard-coded to 32 - TODO confirm against callers
    add         r1, #16                 @ skip the 16 columns already written
    add         r12, #16
    mov         r10, #8
    add         r8, #8                  @ undo the stride tightening done for the 24 case
    add         r9, #8

outer_loop_8:

    add         r6, r1, r3              @ pu1_dst + dst_strd
    add         r4, r12, r2             @ pu1_src + src_strd
    subs        r5, r10, #0             @ checks wd

    ble         end_inner_loop_8

@ 8 output pixels x 2 rows per iteration; the NEON multiply-accumulate chain
@ is interleaved with the loads of the next operands to hide load latency.
inner_loop_8:
    vld1.u32    {d0}, [r12], r11        @ row 0: d0..d7 = eight 1-byte-shifted windows
    vld1.u32    {d1}, [r12], r11
    vld1.u32    {d2}, [r12], r11
    vld1.u32    {d3}, [r12], r11

    @ (the commented-out vext-based extraction from the original is dropped:
    @  the r11 = 1 post-increment loads already produce the shifted windows)
    vld1.u32    {d4}, [r12], r11
    vmull.u8    q4, d1, d25             @ mul_res  = src[0_1] * coeffabs_1
    vld1.u32    {d5}, [r12], r11
    vmlal.u8    q4, d3, d27             @ mul_res += src[0_3] * coeffabs_3
    vld1.u32    {d6}, [r12], r11
    vmlsl.u8    q4, d0, d24             @ mul_res -= src[0_0] * coeffabs_0
    vld1.u32    {d7}, [r12], r11
    vmlsl.u8    q4, d2, d26             @ mul_res -= src[0_2] * coeffabs_2
    vld1.u32    {d12}, [r4], r11        @ row 1: vector load pu1_src + src_strd
    vmlal.u8    q4, d4, d28             @ mul_res += src[0_4] * coeffabs_4
    vld1.u32    {d13}, [r4], r11
    vmlsl.u8    q4, d5, d29             @ mul_res -= src[0_5] * coeffabs_5
    vld1.u32    {d14}, [r4], r11
    vmlal.u8    q4, d6, d30             @ mul_res += src[0_6] * coeffabs_6
    vld1.u32    {d15}, [r4], r11
    vmlsl.u8    q4, d7, d31             @ mul_res -= src[0_7] * coeffabs_7
    vld1.u32    {d16}, [r4], r11

    vmull.u8    q5, d15, d27            @ row 1 filter, same sign pattern
    vld1.u32    {d17}, [r4], r11
    vmlsl.u8    q5, d14, d26
    vld1.u32    {d18}, [r4], r11
    vmlal.u8    q5, d16, d28
    vld1.u32    {d19}, [r4], r11
    vmlsl.u8    q5, d17, d29
    vqrshrun.s16 d20, q4, #6            @ row 0: round, >>6, saturate-narrow to u8
    vmlal.u8    q5, d18, d30
    vmlsl.u8    q5, d19, d31
    vst1.8      {d20}, [r1]!            @ store the row-0 result
    vmlsl.u8    q5, d12, d24
    vmlal.u8    q5, d13, d25

    vqrshrun.s16 d8, q5, #6             @ row 1: round, >>6, saturate-narrow to u8
    subs        r5, r5, #8              @ decrement the wd loop
    vst1.8      {d8}, [r6]!             @ store the row-1 result
    cmp         r5, #4
    bgt         inner_loop_8

end_inner_loop_8:
    subs        r14, r14, #2            @ decrement the ht loop (2 rows done)
    add         r12, r12, r9            @ src += 2*src_strd - wd
    add         r1, r1, r8              @ dst += 2*dst_strd - wd
    bgt         outer_loop_8

    ldr         r10, [sp, #48]          @ loads wd
    cmp         r10, #12                @ wd == 12: still owe the last 4 columns
    beq         outer_loop4_residual


end_loops:
    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return (pc <- saved lr)


@ 16 output pixels x 2 rows per iteration. Software-pipelined: the first
@ row-0 accumulation is primed here and each loop iteration overlaps the
@ current compute with the next iteration's loads.
outer_loop_16:
    str         r0, [sp, #-4]!          @ spill r0/r7 (restored in epilog_16 before
    str         r7, [sp, #-4]!          @  the stack args are re-read)

    add         r6, r1, r3              @ pu1_dst + dst_strd
    add         r4, r12, r2             @ pu1_src + src_strd
    and         r0, r12, #31            @ src alignment tag (only referenced by the
                                        @  commented-out check inside the loop)
    sub         r5, r10, #0             @ checks wd
@    ble end_loops1
    pld         [r12, r2, lsl #1]       @ prefetch two rows ahead
    vld1.u32    {q0}, [r12], r11        @ row 0: q0..q9 = 1-byte-shifted 16-byte windows
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1}, [r12], r11
    vld1.u32    {q2}, [r12], r11
    vld1.u32    {q3}, [r12], r11
    vld1.u32    {q6}, [r12], r11
    vmull.u8    q4, d2, d25             @ row 0, low 8 pixels: tap pattern as above
    vld1.u32    {q7}, [r12], r11
    vmlal.u8    q4, d6, d27
    vld1.u32    {q8}, [r12], r11
    vmlsl.u8    q4, d0, d24
    vld1.u32    {q9}, [r12], r11
    vmlsl.u8    q4, d4, d26
    vmlal.u8    q4, d12, d28
    vmlsl.u8    q4, d14, d29
    vmlal.u8    q4, d16, d30
    vmlsl.u8    q4, d18, d31


inner_loop_16:

    @ The eq condition set here means "this row pair is complete"; the
    @ addeq/subeq bookkeeping below rides on these flags (nothing in between
    @ writes the flags until the cmp r14 / cmp r5 further down).
    subs        r5, r5, #16
    vmull.u8    q10, d3, d25            @ row 0, high 8 pixels

    add         r12, #8                 @ compensate the 16 single-byte post-increments

    vmlsl.u8    q10, d1, d24

    subeq       r14, r14, #2            @ row pair done: ht -= 2
    vmlal.u8    q10, d7, d27

    vld1.u32    {q0}, [r4], r11         @ row 1 loads
    vmlsl.u8    q10, d5, d26

    vld1.u32    {q1}, [r4], r11
    vmlal.u8    q10, d13, d28

    vld1.u32    {q2}, [r4], r11
    vmlal.u8    q10, d17, d30

    vld1.u32    {q3}, [r4], r11
    vmlsl.u8    q10, d15, d29

    vld1.u32    {q6}, [r4], r11
    vmlsl.u8    q10, d19, d31

    vld1.u32    {q7}, [r4], r11
    vqrshrun.s16 d8, q4, #6             @ row 0 low half: round/narrow

    vld1.u32    {q8}, [r4], r11
    vmull.u8    q5, d2, d25             @ row 1, low 8 pixels

    vld1.u32    {q9}, [r4], r11
    vmlal.u8    q5, d6, d27

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    add         r4, #8
    vmlsl.u8    q5, d0, d24

    addeq       r12, r12, r9            @ (row pair done) src += 2*src_strd - wd
    vmlsl.u8    q5, d4, d26

    addeq       r4, r12, r2             @ (row pair done) row-1 src = src + src_strd
    vqrshrun.s16 d9, q10, #6            @ row 0 high half: round/narrow

    vmlal.u8    q5, d12, d28

@    and r7, r12, #31
    vmlsl.u8    q5, d14, d29

    vmlal.u8    q5, d16, d30

    vmlsl.u8    q5, d18, d31

    vmull.u8    q11, d3, d25            @ row 1, high 8 pixels

    vmlsl.u8    q11, d1, d24

    vst1.8      {q4}, [r1]!             @ store 16 row-0 pixels
    vmlal.u8    q11, d7, d27

    addeq       r1, r1, r8              @ (row pair done) dst += 2*dst_strd - wd
    vqrshrun.s16 d10, q5, #6            @ row 1 low half: round/narrow

@    cmp r7, r0
    vmlsl.u8    q11, d5, d26

    vmlal.u8    q11, d13, d28

    vmlal.u8    q11, d17, d30

@    mov r0, r7
    vmlsl.u8    q11, d15, d29

    cmp         r14, #0                 @ all rows done?
    vmlsl.u8    q11, d19, d31

    beq         epilog_16
    @ prime the next iteration's row-0 loads and accumulation
    vld1.u32    {q0}, [r12], r11
    vld1.u32    {q1}, [r12], r11
    vld1.u32    {q2}, [r12], r11
    vld1.u32    {q3}, [r12], r11
    vld1.u32    {q6}, [r12], r11
    vqrshrun.s16 d11, q11, #6           @ row 1 high half: round/narrow
    vmull.u8    q4, d2, d25
    vld1.u32    {q7}, [r12], r11
    vmlal.u8    q4, d6, d27
    vld1.u32    {q8}, [r12], r11
    vmlsl.u8    q4, d0, d24
    vld1.u32    {q9}, [r12], r11
    vmlsl.u8    q4, d4, d26
    vmlal.u8    q4, d12, d28
    cmp         r5, #0
    vmlsl.u8    q4, d14, d29
    moveq       r5, r10                 @ row finished: reset the wd countdown
    vmlal.u8    q4, d16, d30
    vst1.8      {q5}, [r6]!             @ store 16 row-1 pixels
    vmlsl.u8    q4, d18, d31
    addeq       r6, r1, r3              @ row finished: rewind the row-1 dst
    b           inner_loop_16


epilog_16:
    vqrshrun.s16 d11, q11, #6
    vst1.8      {q5}, [r6]!             @ store the last 16 row-1 pixels

    ldr         r7, [sp], #4            @ unspill r7/r0
    ldr         r0, [sp], #4
    ldr         r10, [sp, #48]          @ loads wd
    cmp         r10, #24                @ wd == 24: still owe the last 8 columns
    beq         outer_loop8_residual


end_loops1:
    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return


@ entered from the 8-wide path when wd was 12: produce the rightmost 4 columns
outer_loop4_residual:
    sub         r12, r0, #3             @ pu1_src - 3
    mov         r1, r7                  @ rewind dst to the saved pu1_dst
    add         r1, #8                  @ skip the 8 columns already written
    mov         r10, #4
    add         r12, #8
    mov         r14, #16                @ NOTE(review): residual height hard-coded to 16 - TODO confirm against callers
    add         r8, #4                  @ undo the stride tightening done for the 12 case
    add         r9, #4

outer_loop_4:
    add         r6, r1, r3              @ pu1_dst + dst_strd
    add         r4, r12, r2             @ pu1_src + src_strd

    subs        r5, r10, #0             @ checks wd
    ble         end_inner_loop_4

@ 4 output pixels x 2 rows per iteration: vzip.32 packs row 0 into the low
@ 32 bits and row 1 into the high 32 bits of each d-register, so one 8-tap
@ accumulation filters both rows at once.
inner_loop_4:
    vld1.u32    {d0}, [r12], r11        @ row 0: 1-byte-shifted windows
    vld1.u32    {d1}, [r12], r11
    vld1.u32    {d2}, [r12], r11
    vld1.u32    {d3}, [r12], r11
    vld1.u32    {d4}, [r12], r11
    vld1.u32    {d5}, [r12], r11
    vld1.u32    {d6}, [r12], r11
    vld1.u32    {d7}, [r12], r11
@    add r12,r12,#4                     @ increment the input pointer
    sub         r12, r12, #4            @ net row-0 src advance: 8 loads - 4 = 4 pixels
    vld1.u32    {d12}, [r4], r11        @ row 1: vector load pu1_src + src_strd
    vld1.u32    {d13}, [r4], r11
    vzip.32     d0, d12                 @ zip row 0 (low half) with row 1 (high half)
    vld1.u32    {d14}, [r4], r11
    vzip.32     d1, d13
    vld1.u32    {d15}, [r4], r11
    vzip.32     d2, d14
    vld1.u32    {d16}, [r4], r11
    vzip.32     d3, d15
    vld1.u32    {d17}, [r4], r11
    vzip.32     d4, d16
    vld1.u32    {d18}, [r4], r11
    vzip.32     d5, d17
    vld1.u32    {d19}, [r4], r11
    sub         r4, r4, #4              @ net row-1 src advance: 4 pixels

    vzip.32     d6, d18
    vzip.32     d7, d19

    vmull.u8    q4, d1, d25             @ both rows filtered in one accumulation
    vmlsl.u8    q4, d0, d24
    vmlsl.u8    q4, d2, d26
    vmlal.u8    q4, d3, d27
    vmlal.u8    q4, d4, d28
    vmlsl.u8    q4, d5, d29
    vmlal.u8    q4, d6, d30
    vmlsl.u8    q4, d7, d31

    vqrshrun.s16 d8, q4, #6             @ round, >>6, saturate-narrow to u8
    vst1.32     {d8[0]}, [r1]!          @ row-0 result (low half of d8)
    vst1.32     {d8[1]}, [r6]!          @ row-1 result (high half of d8)
    subs        r5, r5, #4              @ decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        r14, r14, #2            @ decrement the ht by 2
    add         r12, r12, r9            @ src += 2*src_strd - wd
    add         r1, r1, r8              @ dst += 2*dst_strd - wd
    bgt         outer_loop_4
@    subs r7,r7,#1
@    bgt start_loop_count

    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return