@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_11_to_17.s
@*
@* @brief
@*  contains function definitions for luma intra prediction, angular modes 11
@*  to 17. functions are coded in neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_11_to_17_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for angular modes 11 to 17
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (11 to 17)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
@                                         word32 src_strd,
@                                         uword8* pu1_dst,
@                                         word32 dst_strd,
@                                         word32 nt,
@                                         word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #236
@   nt
@   mode

.equ    nt_offset,      236
.equ    mode_offset,    240

.text
.align 4



.globl ihevc_intra_pred_luma_mode_11_to_17_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_11_17_addr_1:
.long idx_neg_idx_11_17 - ulbl3 - 8

idx_neg_idx_11_17_addr_2:
.long idx_neg_idx_11_17 - ulbl4 - 8

col_for_intra_luma_addr_1:
.long col_for_intra_luma - ulbl_1 - 8

col_for_intra_luma_addr_2:
.long col_for_intra_luma - ulbl_2 - 8

col_for_intra_luma_addr_3:
.long col_for_intra_luma - ulbl_3 - 8

col_for_intra_luma_addr_4:
.long col_for_intra_luma - ulbl_4 - 8

.type ihevc_intra_pred_luma_mode_11_to_17_a9q, %function

ihevc_intra_pred_luma_mode_11_to_17_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]
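@ the code below first builds ref_temp on the stack, then interpolates the block.
@ a rough, non-authoritative C sketch of the per-sample arithmetic it implements
@ (variable names are illustrative only; the exact ref_temp offsets come from the
@ idx_neg_idx_11_17 table and the ref_temp + nt - 1 bias):
@     pos   = (col + 1) * intra_pred_ang;   @ angle from gai4_ihevc_ang_table[mode]
@     idx   = pos >> 5;                     @ integer step along the reference
@     fract = pos & 31;                     @ 5-bit fractional position
@     dst[row * dst_strd + col] =
@         ((32 - fract) * ref_temp[base + idx + row]
@          + fract      * ref_temp[base + idx + row + 1] + 16) >> 5;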
    ldr         r4, [sp, #nt_offset]        @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7, r7, pc

    ldr         r5, [sp, #mode_offset]      @mode (11 to 17)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8, r8, pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
    sub         r8, r8, #44

    ldr         r7, [r7]                    @intra_pred_ang

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4                  @ref_temp + nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #1                  @ref_temp + nt - 1

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    ldrb        r11, [r1], #-1

    asr         r9, r9, #5

    ldrb        r12, [r1], #-1
    ldrb        r10, [r1], #-1
    ldrb        r14, [r1], #-1

    strb        r11, [r6], #1
    strb        r12, [r6], #1
    strb        r10, [r6], #1
    strb        r14, [r6], #1

    subs        r7, r7, #4
    beq         end_loop_copy

    sub         r6, #4
    sub         r1, #3

    subs        r7, r7, #4
    beq         loop_copy_8
    subs        r7, r7, #8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      d0, [r1]
    sub         r1, #8
    vld1.8      d1, [r1]
    sub         r1, #8
    vld1.8      d2, [r1]
    sub         r1, #8
    vld1.8      d3, [r1]

    vrev64.8    d0, d0
    vrev64.8    d1, d1
    vst1.8      d0, [r6]!
    vrev64.8    d2, d2
    vst1.8      d1, [r6]!
    vrev64.8    d3, d3
    vst1.8      d2, [r6]!
    vst1.8      d3, [r6]!
    sub         r1, #1
    b           end_loop_copy

loop_copy_16:
    vld1.8      d0, [r1]
    sub         r1, #8
    vld1.8      d1, [r1]

    vrev64.8    d0, d0
    vrev64.8    d1, d1

    vst1.8      d0, [r6]!
    vst1.8      d1, [r6]!
    sub         r1, #1
    b           end_loop_copy

loop_copy_8:
    vld1.8      d0, [r1]
    vrev64.8    d0, d0
    vst1.8      d0, [r6]!
    sub         r1, #1
end_loop_copy:

    ldrb        r11, [r1], #-1
    strb        r11, [r6], #1

    cmp         r9, #-1
    bge         prologue_8_16_32

    add         r6, sp, r4                  @ref_temp + nt
    sub         r6, r6, #2                  @ref_temp + nt - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang

    ldrb        r11, [r1, r7, lsr #8]
    strb        r11, [r6], #-1

    subs        r9, r9, #1

    bne         loop_copy_ref_idx

prologue_8_16_32:
    cmp         r4, #4
    beq         sz_4_proc
    ldr         r14, col_for_intra_luma_addr_1
ulbl_1:
    add         r14, r14, pc

    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
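@ d31 now holds the next eight column offsets loaded from col_for_intra_luma
@ (presumably 1..8 on this first pass); together with the replicated angle in
@ d30 they form the per-column (col+1)*intra_pred_angle products computed below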
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4                     @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    mov         r0, #1

    sub         r7, r5, #11
    vdup.8      d2, r0                      @contains #1 for adding to get ref_main_idx + 1
    ldr         r12, idx_neg_idx_11_17_addr_1 @load least idx table
ulbl3:
    add         r12, r12, pc

    mov         r0, #2
    vdup.8      d3, r0

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    add         r1, sp, r4                  @ref_temp + nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx added to final idx values
    sub         r1, r1, #1                  @ref_temp + nt - 1

    add         r6, r1, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    vshr.s16    q11, q11, #5

    mov         r0, #31
    vdup.8      d29, r0                     @contains #31 for vand operation

    mov         r0, #32
    vdup.8      d28, r0

    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0

    mov         r0, #1
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vadd.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vadd.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)
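@ note: this prologue is software pipelined - each group of instructions issues
@ the table lookups for a later row, the widening multiply-accumulate for the
@ current row and the rounded narrowing store for an earlier row, keeping the
@ vtbl, vmull/vmlal and vst1 work overlapped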
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4
    ldrle       r14, col_for_intra_luma_addr_2
ulbl_2:
    addle       r14, r14, pc
    addle       r0, r0, #8

    mov         r5, r2
    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vdup.8      d27, r0                     @row value inc or reset accordingly
    ldr         r9, [r8]
    add         r9, r0, r9
    sub         r9, r9, #1
    vdup.8      d26, r9
    vadd.s8     d8, d27, d11                @ref_main_idx (add row)

    sub         r4, r4, #8

kernel_8_16_32:

    vsub.s8     d8, d8, d26                 @ref_main_idx
    vmov        d26, d10

    subs        r11, r11, #8
    add         r6, r1, r9
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vadd.s8     d9, d2, d8                  @ref_main_idx + 1

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_luma_addr_3
ulbl_3:
    addle       r14, r14, pc
    movle       r8, r12
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vadd.s8     d4, d2, d8                  @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vadd.s8     d5, d2, d9                  @ref_main_idx + 1 (row 1)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
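@ d31 is reloaded here with the column offsets for the next 8-column strip
@ while rows of the current strip are still being interpolated, hiding the
@ load latency inside the kernel loop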
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vadd.s8     d8, d3, d8                  @ref_main_idx (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vadd.s8     d9, d3, d9                  @ref_main_idx + 1 (row 2)

    addle       r11, r4, #8
    ldr         r9, [r8]
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5, r2, r3, lsl #2
    add         r9, r0, r9

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    sub         r9, r9, #1
    vqmovn.s16  d11, q7

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vmull.u8    q11, d16, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2, r3, lsl #2
    addgt       r2, r7, r2
    suble       r2, r2, r4

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32
epil_8_16_32:

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
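@ the remaining rows (6 and 7) of the final 8x8 strip are rounded and stored
@ below to drain the software pipeline before returning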
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)


    b           end_func

sz_4_proc:
    ldr         r14, col_for_intra_luma_addr_4
ulbl_4:
    add         r14, r14, pc

    vld1.8      d31, [r14]
    mov         r12, #1

    vdup.8      d2, r12                     @contains #1 for adding to get ref_main_idx + 1
    mov         r0, #2

    vdup.8      d3, r0
    ldr         r12, idx_neg_idx_11_17_addr_2 @load least idx table
ulbl4:
    add         r12, r12, pc

    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    sub         r7, r5, #11

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    ldr         r9, [r8]

    vdup.8      d26, r9                     @least idx added to final idx values
    add         r6, sp, r4                  @ref_temp + nt

    sub         r6, r6, #1                  @ref_temp + nt - 1
    vmovn.s16   d6, q11
    add         r6, r6, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    mov         r0, #31

    vdup.8      d29, r0                     @contains #31 for vand operation
    mov         r1, #32

    vdup.8      d28, r1

    vshr.s16    q11, q11, #5
    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vsub.s8     d7, d28, d6                 @32-fract

    vadd.s8     d8, d8, d2                  @ref_main_idx (add 1)
    vsub.s8     d8, d8, d26                 @ref_main_idx
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1

    vadd.s8     d4, d8, d2                  @row 1 ref_main_idx
    vadd.s8     d5, d9, d2

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vadd.s8     d8, d8, d3                  @idx (row 2)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d9, d9, d3                  @idx+1 (row 2)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shift (row 0)

    vadd.s8     d4, d4, d3                  @idx (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d5, d5, d3                  @idx+1 (row 3)

    vmull.u8    q10, d12, d7                @mul (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmlal.u8    q10, d13, d6                @mul (row 2)

    vst1.32     d24[0], [r2], r3            @st row 0
    vrshrn.i16  d22, q11, #5                @round shift (row 1)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)

    vmull.u8    q9, d16, d7                 @mul (row 3)
    vmlal.u8    q9, d17, d6                 @mul (row 3)

    vst1.32     d22[0], [r2], r3            @st row 1
    vrshrn.i16  d20, q10, #5                @round shift (row 2)

    vst1.32     d20[0], [r2], r3            @st row 2

    vrshrn.i16  d18, q9, #5                 @round shift (row 3)

    vst1.32     d18[0], [r2], r3            @st (row 3)

end_func:
    add         sp, sp, #132
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp