@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_11_to_17.s
@*
@* @brief
@*  contains function definitions for luma intra prediction (angular modes
@*  11 to 17). functions are coded in neon assembly and can be compiled
@*  using rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for angular modes 11 to 17
@*
@* @par description:
@*  intra prediction for luma along angular modes 11 to 17
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (11 to 17)
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
@                                         word32 src_strd,
@                                         uword8* pu1_dst,
@                                         word32 dst_strd,
@                                         word32 nt,
@                                         word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

.text
.align 4




.globl ihevc_intra_pred_luma_mode_11_to_17_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_11_17_addr_1:
.long idx_neg_idx_11_17 - ulbl3 - 8

idx_neg_idx_11_17_addr_2:
.long idx_neg_idx_11_17 - ulbl4 - 8

col_for_intra_luma_addr_1:
.long col_for_intra_luma - ulbl_1 - 8

col_for_intra_luma_addr_2:
.long col_for_intra_luma - ulbl_2 - 8

col_for_intra_luma_addr_3:
.long col_for_intra_luma - ulbl_3 - 8

col_for_intra_luma_addr_4:
.long col_for_intra_luma - ulbl_4 - 8

.type ihevc_intra_pred_luma_mode_11_to_17_a9q, %function

ihevc_intra_pred_luma_mode_11_to_17_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4, [sp, #40]               @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7, r7, pc
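
@ note: each table address above is stored as "symbol - ulbl<n> - 8"; adding
@ pc (which reads as the current instruction address + 8 in ARM state) at the
@ matching ulbl<n> label reconstructs the absolute address of the external
@ table, keeping this code position-independent.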

    ldr         r5, [sp, #44]               @mode (11 to 17)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8, r8, pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
    sub         r8, r8, #44

    ldr         r7, [r7]                    @intra_pred_ang
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4                  @ref_temp + nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #1                  @ref_temp + nt - 1

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    ldrb        r11, [r1], #-1

    asr         r9, r9, #5

    ldrb        r12, [r1], #-1
    ldrb        r10, [r1], #-1
    ldrb        r14, [r1], #-1

    strb        r11, [r6], #1
    strb        r12, [r6], #1
    strb        r10, [r6], #1
    strb        r14, [r6], #1

    subs        r7, r7, #4
    beq         end_loop_copy

    sub         r6, #4
    sub         r1, #3

    subs        r7, r7, #4
    beq         loop_copy_8
    subs        r7, r7, #8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      d0, [r1]
    sub         r1, #8
    vld1.8      d1, [r1]
    sub         r1, #8
    vld1.8      d2, [r1]
    sub         r1, #8
    vld1.8      d3, [r1]

    vrev64.8    d0, d0
    vrev64.8    d1, d1
    vst1.8      d0, [r6]!
    vrev64.8    d2, d2
    vst1.8      d1, [r6]!
    vrev64.8    d3, d3
    vst1.8      d2, [r6]!
    vst1.8      d3, [r6]!
    sub         r1, #1
    b           end_loop_copy

loop_copy_16:
    vld1.8      d0, [r1]
    sub         r1, #8
    vld1.8      d1, [r1]

    vrev64.8    d0, d0
    vrev64.8    d1, d1

    vst1.8      d0, [r6]!
    vst1.8      d1, [r6]!
    sub         r1, #1
    b           end_loop_copy

loop_copy_8:
    vld1.8      d0, [r1]
    vrev64.8    d0, d0
    vst1.8      d0, [r6]!
    sub         r1, #1

end_loop_copy:

    ldrb        r11, [r1], #-1
    strb        r11, [r6], #1

    cmp         r9, #-1
    bge         prologue_8_16_32

    add         r6, sp, r4                  @ref_temp + nt
    sub         r6, r6, #2                  @ref_temp + nt - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang

    ldrb        r11, [r1, r7, lsr #8]
    strb        r11, [r6], #-1

    subs        r9, r9, #1

    bne         loop_copy_ref_idx
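
@ the loops above build ref_temp: the reference samples are copied in reverse
@ (vrev64.8) so the main reference runs in ascending order, and for negative
@ angles (r9 < -1) the reference is extended via the inverse angle
@ (inv_ang_sum >> 8 selects the projected sample). the prediction below is
@ the standard two-tap hevc angular interpolation, computed 8 pixels at a
@ time:
@     idx   = ((col + 1) * intra_pred_ang) >> 5
@     fract = ((col + 1) * intra_pred_ang) & 31
@     pred  = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5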

prologue_8_16_32:
    cmp         r4, #4
    beq         sz_4_proc
    ldr         r14, col_for_intra_luma_addr_1
ulbl_1:
    add         r14, r14, pc

    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #8)

    mov         r11, r4                     @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    mov         r0, #1

    sub         r7, r5, #11
    vdup.8      d2, r0                      @contains #1 for adding to get ref_main_idx + 1
    ldr         r12, idx_neg_idx_11_17_addr_1 @load least idx table
ulbl3:
    add         r12, r12, pc

    mov         r0, #2
    vdup.8      d3, r0

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    add         r1, sp, r4                  @ref_temp + nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx added to final idx values
    sub         r1, r1, #1                  @ref_temp + nt - 1

    add         r6, r1, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    vshr.s16    q11, q11, #5

    mov         r0, #31
    vdup.8      d29, r0                     @contains #31 for vand operation

    mov         r0, #32
    vdup.8      d28, r0

    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0

    mov         r0, #1
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vadd.s8     d4, d8, d2                  @ref_main_idx (row 1)
    vadd.s8     d5, d9, d2                  @ref_main_idx + 1 (row 1)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 2)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4
    ldrle       r14, col_for_intra_luma_addr_2
ulbl_2:
    addle       r14, r14, pc
    addle       r0, r0, #8

    mov         r5, r2
    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vdup.8      d27, r0                     @row value inc or reset accordingly
    ldr         r9, [r8]
    add         r9, r0, r9
    sub         r9, r9, #1
    vdup.8      d26, r9
    vadd.s8     d8, d27, d11                @ref_main_idx (add row)

    sub         r4, r4, #8
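
@ kernel_8_16_32 walks the block in 8x8 tiles and is software-pipelined:
@ each iteration issues the table lookups and multiplies for rows 0-7 of the
@ current tile while finishing the stores of rows 4-7 of the previous one
@ (the "(from previous loop)" comments). r11 counts columns across a strip
@ of tiles; r10, decremented by 8 per tile, counts down the remaining tiles.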

kernel_8_16_32:

    vsub.s8     d8, d8, d26                 @ref_main_idx
    vmov        d26, d10

    subs        r11, r11, #8
    add         r6, r1, r9
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)
    vadd.s8     d9, d2, d8                  @ref_main_idx + 1

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_luma_addr_3
ulbl_3:
    addle       r14, r14, pc
    movle       r8, r12
    vdup.8      d27, r0                     @row value inc or reset accordingly

    vadd.s8     d4, d2, d8                  @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vadd.s8     d5, d2, d9                  @ref_main_idx + 1 (row 1)


    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
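
@ d26 carries the narrowed (col+1)*intra_pred_ang terms from the previous
@ iteration: the low 5 bits give the subpixel weight (fract), while the
@ shifted copy already narrowed into d11 gives the integer ref_main index.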
    vand        d6, d29, d26                @fract values in d1/ idx values in d0

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vadd.s8     d8, d3, d8                  @ref_main_idx (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vadd.s8     d9, d3, d9                  @ref_main_idx + 1 (row 2)

    addle       r11, r4, #8
    ldr         r9, [r8]
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 3)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5, r2, r3, lsl #2
    add         r9, r0, r9


    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 4)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vadd.s8     d8, d8, d3                  @ref_main_idx (row 6)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 5)
    vadd.s8     d9, d9, d3                  @ref_main_idx + 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    sub         r9, r9, #1
    vqmovn.s16  d11, q7

    vadd.s8     d4, d4, d3                  @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1}, d8            @load from ref_main_idx (row 6)
    vadd.s8     d5, d5, d3                  @ref_main_idx + 1 (row 7)

    vmull.u8    q11, d16, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1}, d9            @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)


    add         r2, r2, r3, lsl #2          @skip the 4 rows stored via r5
    addgt       r2, r7, r2
    suble       r2, r2, r4

    subs        r10, r10, #8                @subtract 8 and go to end if 8x8

    bne         kernel_8_16_32
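
@ epil_8_16_32 drains the software pipeline: the multiplies and stores for
@ rows 4-7 of the last tile are still in flight when the loop falls through.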
epil_8_16_32:

    vtbl.8      d10, {d0,d1}, d4            @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1}, d5            @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)


    b           end_func

sz_4_proc:
    ldr         r14, col_for_intra_luma_addr_4
ulbl_4:
    add         r14, r14, pc

    vld1.8      d31, [r14]
    mov         r12, #1

    vdup.8      d2, r12                     @contains #1 for adding to get ref_main_idx + 1
    mov         r0, #2

    vdup.8      d3, r0
    ldr         r12, idx_neg_idx_11_17_addr_2 @load least idx table
ulbl4:
    add         r12, r12, pc

    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)
    sub         r7, r5, #11

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    ldr         r9, [r8]

    vdup.8      d26, r9                     @least idx added to final idx values
    add         r6, sp, r4                  @ref_temp + nt

    sub         r6, r6, #1                  @ref_temp + nt - 1
    vmovn.s16   d6, q11
    add         r6, r6, r9

    vld1.8      {d0,d1}, [r6]               @stores the 32 values reqd based on indices values (from least idx)
    mov         r0, #31

    vdup.8      d29, r0                     @contains #31 for vand operation
    mov         r1, #32

    vdup.8      d28, r1

    vshr.s16    q11, q11, #5
    vqmovn.s16  d8, q11

    vand        d6, d6, d29                 @fract values in d1/ idx values in d0
    vsub.s8     d7, d28, d6                 @32-fract

    vadd.s8     d8, d8, d2                  @ref_main_idx (add 1)
    vsub.s8     d8, d8, d26                 @ref_main_idx
    vadd.s8     d9, d8, d2                  @ref_main_idx + 1

    vadd.s8     d4, d8, d2                  @row 1 ref_main_idx
    vadd.s8     d5, d9, d2

    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 0)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 0)


    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vadd.s8     d8, d8, d3                  @idx (row 2)
    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 1)
    vadd.s8     d9, d9, d3                  @idx+1 (row 2)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d12, {d0,d1}, d8            @load from ref_main_idx (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shift (row 0)

    vadd.s8     d4, d4, d3                  @idx (row 3)
    vtbl.8      d13, {d0,d1}, d9            @load from ref_main_idx + 1 (row 2)
    vadd.s8     d5, d5, d3                  @idx+1 (row 3)

    vmull.u8    q10, d12, d7                @mul (row 2)
    vtbl.8      d16, {d0,d1}, d4            @load from ref_main_idx (row 3)
    vmlal.u8    q10, d13, d6                @mul (row 2)

    vst1.32     d24[0], [r2], r3            @st row 0
    vrshrn.i16  d22, q11, #5                @round shift (row 1)

    vtbl.8      d17, {d0,d1}, d5            @load from ref_main_idx + 1 (row 3)

    vmull.u8    q9, d16, d7                 @mul (row 3)
    vmlal.u8    q9, d17, d6                 @mul (row 3)

    vst1.32     d22[0], [r2], r3            @st row 1
    vrshrn.i16  d20, q10, #5                @round shift (row 2)

    vst1.32     d20[0], [r2], r3            @st row 2

    vrshrn.i16  d18, q9, #5                 @round shift (row 3)

    vst1.32     d18[0], [r2], r3            @st (row 3)

end_func:
    add         sp, sp, #132
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp