///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_11_to_17.s
//*
//* @brief
//*  Contains the function definition for chroma intra prediction, angular
//*  modes 11 to 17.  Coded with NEON (ASIMD) instructions for AArch64.
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  ihevc_intra_pred_chroma_mode_11_to_17_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  Chroma angular intra-prediction filter for modes 11..17 (negative
//*  prediction angles; reference samples from both left and top edges).
//*
//* @par description:
//*  Builds a reversed/extended reference array ref_temp on the stack, then
//*  for each 8-wide column strip gathers reference pairs with TBL and
//*  interpolates:  dst = (ref[idx]*(32-fract) + ref[idx+1]*fract + 16) >> 5.
//*  Chroma samples are interleaved (u,v), hence all index arithmetic is in
//*  units of 2 bytes.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference samples)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  intra prediction mode (11..17)
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
//                                           word32 src_strd,
//                                           uword8* pu1_dst,
//                                           word32 dst_strd,
//                                           word32 nt,
//                                           word32 mode)
//
//**************variables vs registers*****************************************
// AAPCS64: all six arguments arrive in registers (no stack args).
//x0 => *pu1_ref
//x1 => src_strd (unused as a stride; x1 is reused as a scratch pointer)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
// Callee-saved registers touched: d12-d15 (v12-v15 low halves) and
// x19/x20 -- saved in the prologue, restored before ret.
// 132 bytes of stack hold ref_temp[2 * max_cu_size + 2] (uword8 pairs).

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_chroma_mode_11_to_17_av8
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_11_17

.type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function

ihevc_intra_pred_chroma_mode_11_to_17_av8:

    // stmfd sp!, {x4-x12, x14}            //(ARMv7 leftover) stack stores the values of the arguments

    // Save callee-saved regs clobbered below (AAPCS64: v8-v15 low 64 bits,
    // x19-x28).  x20 is used as the csel scratch throughout.
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    // GOT-indirect loads of the two angle tables (position-independent).
    adrp        x7,  :got:gai4_ihevc_ang_table
    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]

    adrp        x8,  :got:gai4_ihevc_inv_ang_table
    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]

    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
    add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table[mode - 11]
    sub         x8, x8, #44                 //-44 = -11*4: rebase table index to (mode-11)

    ldr         w7,  [x7]                   //intra_pred_ang (signed, negative for these modes)
    sxtw        x7,w7
    sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 2]

    ldr         w8, [x8]                    //inv_ang
    sxtw        x8,w8
    add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt

    mul         x9, x4, x7                  //nt*intra_pred_ang

    sub         x6, x6, #2                  //ref_temp + 2*nt - 2

    add         x1, x0, x4, lsl #2          //x1 = &src[4nt] (start of top refs, chroma => 2 bytes/sample)
    dup         v30.8b,w7                   //intra_pred_ang broadcast for the per-column smull

    mov         x7, x4                      //x7 = copy counter (nt)

    sub         x1,x1,#6                    //address calculation for copying 4 halfwords

    asr         x9, x9, #5                  //x9 = (nt*ang)>>5 = most negative ref index needed

    // ---- Copy the left reference column into ref_temp in reversed (u,v)
    // ---- halfword order, 4 chroma pairs (8 bytes) at a time.
    ld1         {v0.8b},[x1]
    rev64       v0.4h,  v0.4h               //reverse the 4 (u,v) pairs, keeping u before v
    st1         {v0.8b},[x6],#8

    sub         x1, x1,#8

    subs        x7, x7, #4
    add         x20, x1,#8
    csel        x1, x20, x1,eq              //nt==4: undo the pointer step, copy is done
    beq         end_loop_copy
    subs        x7,x7,#4
    beq         loop_copy_8
    subs        x7,x7,#8
    beq         loop_copy_16

loop_copy_32:
    // NOTE(review): this 32-wide path looks inconsistent -- both ld1s below
    // load v0/v1, yet v2-v6 are reversed and stored before the trailing ld1
    // fills v4-v6, i.e. stale register contents would be written.  For
    // chroma nt appears to be <= 16 (the subs chain above), so this path
    // seems unreachable -- confirm against upstream libhevc before use.
    sub         x1, x1,#24
    ld1         {v0.16b, v1.16b},[x1]

    sub         x1, x1,#24
    ld1         {v0.16b, v1.16b},[x1],#32

    rev64       v6.4h,  v6.4h
    rev64       v5.4h,  v5.4h
    rev64       v4.4h,  v4.4h
    rev64       v3.4h,  v3.4h
    rev64       v2.4h,  v2.4h
    rev64       v1.4h,  v1.4h
    rev64       v0.4h,  v0.4h

    st1         {v6.8b},[x6],#8
    st1         {v5.8b},[x6],#8
    st1         {v4.8b},[x6],#8
    st1         {v3.8b},[x6],#8
    st1         {v2.8b},[x6],#8
    st1         {v1.8b},[x6],#8
    st1         {v0.8b},[x6],#8

    ld1         {v4.8b, v5.8b, v6.8b},[x1],#24
    b           end_loop_copy

loop_copy_16:
    sub         x1, x1,#16
    ld1         {v0.8b, v1.8b, v2.8b},[x1]  //remaining 12 chroma pairs for nt==16

    rev64       v2.4h,  v2.4h
    rev64       v1.4h,  v1.4h
    rev64       v0.4h,  v0.4h

    st1         {v2.8b},[x6],#8             //store in reversed block order to complete the mirror
    st1         {v1.8b},[x6],#8
    st1         {v0.8b},[x6],#8

    b           end_loop_copy
loop_copy_8:
    ld1         {v0.8b},[x1]                //remaining 4 chroma pairs for nt==8
    rev64       v0.4h,  v0.4h
    st1         {v0.8b},[x6],#8
end_loop_copy:
    sub         x1, x1,#2

    // Copy the corner (u,v) pair.  The sxtw after the store is redundant
    // (kept as-is; w11 is dead afterwards).
    ldrh        w11, [x1], #-2
    sxtw        x11,w11
    strh        w11, [x6], #2
    sxtw        x11,w11

    cmn         x9, #1                      //if (nt*ang)>>5 >= -1, no extra negative refs needed
    bge         prologue_8_16_32

    // ---- Project top-row samples to negative ref_temp indices using the
    // ---- inverse angle (8.8 fixed point accumulator in x7).
    add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt
    sub         x6, x6, #4                  //ref_temp + 2 * nt - 2 - 2 (first negative slot)

    mov         x12, #-1

    sub         x20, x9, x12                //count to take care off ref_idx (= -(idx) - 1 entries)
    neg         x9, x20

    add         x1, x0, x4, lsl #2          //x1 = &src[4nt]

    mov         x7, #128                    //inv_ang_sum, biased for rounding

loop_copy_ref_idx:

    add         x7, x7, x8                  //inv_ang_sum += inv_ang

    lsr         x0, x7, #8                  //integer part of the projection
    lsl         x0, x0, #1                  //*2: chroma pairs are 2 bytes

    ldrh        w11, [x1, x0]
    sxtw        x11,w11
    strh        w11, [x6], #-2              //fill ref_temp backwards
    sxtw        x11,w11

    subs        x9, x9, #1

    bne         loop_copy_ref_idx

prologue_8_16_32:

    adrp        x14, :got:col_for_intra_chroma  //table of (col+1) values per 8-wide strip
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]

    lsr         x10, x4, #3
    ld1         {v31.8b},[x14],#8
    mul         x10, x4, x10                //block counter (dec by #4 per 8 rows; nt*nt/8 units)

    lsl         x11, x4, #1                 //col counter to be inc/dec by #8
    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)

    sub         x7, x5, #11                 //mode - 11

    adrp        x12, :got:idx_neg_idx_chroma_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_chroma_11_17]

    add         x12, x12, x7, lsl #4        //row of 4 least-idx entries for this mode
    mov         x8, x12

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8-8x3 (dst rewind across an 8-row strip)

    ldr         w9, [x8]
    sxtw        x9,w9
    lsl         x9, x9, #1                  //least idx in bytes (chroma pairs)
    add         x1, sp, x4, lsl #1          //ref_temp + 2nt

    xtn         v6.8b,  v22.8h              //low byte of col*ang: fractional bits live here
    dup         v26.8b,w9                   //least idx added to final idx values
    sub         x1, x1, #2                  //ref_temp + 2nt - 2

    add         x6, x1, x9                  //window base = ref_temp + 2nt - 2 + least_idx

    ld1         {v0.16b, v1.16b}, [x6]      //32-byte ref window; TBL indices below are relative to it
    sshr        v22.8h, v22.8h,#5           //integer part of col*ang

    // mov x0, #31
    movi        v29.8b, #31                 //contains #31 for vand operation

    // mov x0, #32
    movi        v28.8b, #32

    sqxtn       v19.8b,  v22.8h
    shl         v19.8b, v19.8b,#1           // 2 * idx (byte offsets into the (u,v) stream)

    and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0

    // mov x0, #2
    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1

    mov         x0,#0x100                   // idx value for v is +1 of u: per-lane halfword pattern
    dup         v27.4h,w0                   //   {0,1,0,1,...} bytes after dup
    add         v27.8b,  v27.8b ,  v29.8b   //   +2 => {2,3,2,3,...}: u/v byte selectors within a pair
    mov         x0,#0                       //x0 = row counter for the main loop

    add         v19.8b,  v19.8b ,  v27.8b   //ref_main_idx (add row)
    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx (row 0), rebased to the 32-byte window
    add         v21.8b,  v19.8b ,  v29.8b   //ref_main_idx + 1 (row 0)
    tbl         v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    tbl         v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
    add         v4.8b,  v19.8b ,  v29.8b    //ref_main_idx (row 1)
    add         v5.8b,  v21.8b ,  v29.8b    //ref_main_idx + 1 (row 1)

    // mov x0, #4 @ 2 *(row * 2 )
    movi        v29.8b, #4                  //per-row index step = 2 pairs = 4 bytes

    // ---- Software-pipelined 8-row prologue: gather (tbl), weight
    // ---- (umull/umlal), round+narrow (rshrn #5), store; each stage one
    // ---- row ahead of the next.
    tbl         v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 2)
    add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 2)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)

    tbl         v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    tbl         v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 4)
    add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 4)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    tbl         v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    tbl         v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 6)
    add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 6)

    st1         {v18.8b},[x2], x3           //st (row 3)
    cmp         x4,#4                       //nt==4: only 4 rows (8 chroma bytes wide) -- done
    beq         end_func
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    tbl         v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)

    st1         {v24.8b},[x2], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    tbl         v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v22.8b},[x2], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v20.8b},[x2], x3           //st (row 6)

    subs        x10, x10, #4                //one 8x8 chroma tile done; end if that was all

    st1         {v18.8b},[x2], x3           //st (row 7)

    beq         end_func

    // ---- Advance to the next 8-wide strip (gt: same row band, next cols)
    // ---- or wrap to the next 8-row band (le), reloading the column table.
    subs        x11, x11, #8
    add         x20, x8, #4
    csel        x8, x20, x8,gt              //next least-idx entry
    add         x20, x2, x7
    csel        x2, x20, x2,gt              //dst: rewind 8 rows, step 8 bytes right
    csel        x8, x12, x8,le              //wrap: back to first least-idx entry
    sub         x20, x2, x4
    csel        x2, x20, x2,le
    add         x20, x2, #8
    csel        x2, x20, x2,le
    lsl         x20, x4, #1
    csel        x11,x20,x11,le              //reset col counter to 2*nt
    bgt         lbl400
    adrp        x14, :got:col_for_intra_chroma  //wrapped: restart the column table
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl400:
    add         x20, x0, #8
    csel        x0, x20, x0,le              //row counter += 8 on wrap

    ld1         {v31.8b},[x14],#8
    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    xtn         v23.8b,  v12.8h             //fract (pre-mask) for next strip
    sshr        v12.8h, v12.8h,#5
    sqxtn       v25.8b,  v12.8h
    shl         v25.8b, v25.8b,#1           //2*idx
    orr         x5,x0,x0, lsl#8             //row in both bytes of a halfword...
    add         x5, x5,#0x002
    add         x5, x5,#0x300               //...plus the {2,3} u/v selectors
    dup         v27.4h,w5                   //row value inc or reset accordingly
    ldr         w9, [x8]
    sxtw        x9,w9
    lsl         x9, x9, #1
    add         x9, x9, x0, lsl #1          //window offset = least_idx*2 + row*2
    // sub x9, x9, #1
    dup         v26.8b,w9
    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    mov         x5,x2                       //x5 = dst cursor for rows 4..7 of previous pipeline

    // sub x4,x4,#8

kernel_8_16_32:
    // Steady-state loop: each iteration finishes rows 4-7 of the previous
    // strip (stored via x5) while computing rows 0-7 of the current strip
    // (stored via x2).  Register roles rotate exactly as in the prologue.
    movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1

    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    mov         v26.8b, v23.8b              //carry fract (pre-mask) of this strip

    subs        x11, x11, #8
    add         x6, x1, x9
    tbl         v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    add         v21.8b, v29.8b , v19.8b     //ref_main_idx + 1

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0,le
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    ld1         {v0.16b, v1.16b}, [x6]      //reload the 32-byte ref window for the new strip

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    csel        x8, x12, x8,le
    orr         x9,x0,x0, lsl#8
    lsl         x9, x9, #1
    add         x9, x9,#0x002
    add         x9, x9,#0x300
    dup         v27.4h,w9                   //row value inc or reset accordingly

    bgt         lbl452
    adrp        x14, :got:col_for_intra_chroma
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl452:

    add         v4.8b,  v29.8b ,  v19.8b    //ref_main_idx (row 1)
    tbl         v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
    add         v5.8b,  v29.8b ,  v21.8b    //ref_main_idx + 1 (row 1)

    movi        v29.8b, #31                 //mask for extracting fract bits

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    tbl         v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    ld1         {v31.8b},[x14],#8
    and         v6.8b,  v29.8b ,  v26.8b    //fract values in d1/ idx values in d0

    movi        v29.8b, #4                  //per-row index step (2 pairs)

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    add         v19.8b,  v29.8b ,  v19.8b   //ref_main_idx (row 2)
    tbl         v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    add         v21.8b,  v29.8b ,  v21.8b   //ref_main_idx + 1 (row 2)

    lsl         x20, x4, #1
    csel        x11,x20,x11,le
    ldr         w9, [x8]
    sxtw        x9,w9
    lsl         x9, x9, #1
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
    tbl         v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)

    add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 4)
    tbl         v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col) for NEXT strip
    add         x5,x2,x3,lsl#2              //x5 = dst + 4 rows: where rows 4-7 will be stored
    add         x9, x9, x0, lsl #1


    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
    tbl         v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    xtn         v23.8b,  v14.8h             //next strip's fract (pre-mask)
    sshr        v14.8h, v14.8h,#5

    add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 6)
    tbl         v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 6)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    // sub x9, x9, #1
    sqxtn       v25.8b,  v14.8h             //next strip's integer idx

    add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
    tbl         v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
    add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)

    shl         v25.8b, v25.8b,#1           //2*idx for next strip

    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    dup         v26.8b,w9

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)


    // dst bookkeeping mirroring the strip/band advance above.
    add         x2, x2, x3, lsl #2
    add         x20, x7, x2
    csel        x2, x20, x2,gt
    sub         x20, x2, x4, lsl #1
    csel        x2, x20, x2,le
    add         x20,x2,#8
    csel        x2, x20, x2,le

    subs        x10, x10, #4                //one more 8x8 tile done

    bne         kernel_8_16_32
epil_8_16_32:

    // Drain the pipeline: rows 4-7 of the final strip.
    tbl         v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    st1         {v18.8b},[x5], x3           //st (row 7)

end_func:
    add         sp, sp, #132                //release ref_temp
    // ldmfd sp!,{x4-x12,x15}               //(ARMv7 leftover) reload the registers from sp
    ldp         x19, x20,[sp],#16           //restore callee-saved regs in reverse push order
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ret