///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/

///**
//******************************************************************************
//* //file
//*  ihevc_inter_pred_luma_horz_w16out.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded in AArch64 (ARMv8-A) Advanced SIMD assembly,
//*  GNU assembler syntax.
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_luma_horz_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  inter prediction luma filter for horizontal 16-bit output
//*
//* //par description:
//*  applies a horizontal 8-tap filter with coefficients pointed to by
//*  'pi1_coeff' to the elements pointed to by 'pu1_src' and writes to the
//*  location pointed to by 'pi2_dst'. no downshifting or clipping is done; the
//*  output is used as an input for vertical filtering or weighted prediction.
//*
//*  the absolute values of the coefficients are used and the tap signs are
//*  hard-wired in the code as  - + - + + - + -  (taps 0..7).
//*  reads start 3 bytes to the left of 'pu1_src'.
//*
//*  assumptions : the function is optimized considering the fact that width
//*  is a multiple of 4 or 8. width 8 and 16 are optimized further; widths 12
//*  and 24 are handled as an 8/16 wide strip plus a 4/8 wide residual strip
//*  when the height is even.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride (in bytes)
//*
//* //param[in] dst_strd
//*  integer destination stride (in 16-bit elements)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (8 taps)
//*
//* //param[in] ht
//*  integer height of the array; nothing is written when ht <= 0
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*  none
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
//                                       word16 *pi2_dst,
//                                       word32 src_strd,
//                                       word32 dst_strd,
//                                       word8 *pi1_coeff,
//                                       word32 ht,
//                                       word32 wd)

// ABI: AAPCS64.
//
// incoming arguments
//x0  - pu1_src
//x1  - pi2_dst            (also used as running dst pointer, row i)
//x2  - src_strd           (sign-extended from w2 on entry)
//x3  - dst_strd           (sign-extended from w3 on entry)
//x4  - pi1_coeff
//x5  - ht                 (sign-extended from w5 on entry)
//x6  - wd                 (sign-extended from w6 on entry)
//
// working registers
//x8  - src pointer, row ii (row i + src_strd)
//x9  - inner (column) loop counter
//x10 - dst pointer, row ii (row i + dst_strd)
//x11 - ht & 1 during dispatch / residue, saved dst base for residual strips
//x12 - dst_strd - wd      (in 16-bit elements, adjusted for split widths)
//x13 - 2*src_strd - wd    (adjusted for split widths)
//x14 - wd handled by the current strip
//x15 - post-index increment for the byte-stepped loads
//x16 - src pointer, row i
//x19 - rows still to be processed by the row-pair loops
//x20 - scratch
//
//v24-v31 - |coeff[0..7]| broadcast to all byte lanes
//
// push_v_regs / pop_v_regs are macros from ihevc_neon_macros.s (not visible in
// this file); NOTE(review): presumably they save/restore the callee-saved
// SIMD registers, of which v8, v10 and v12-v15 are written here - confirm.

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_horz_w16out_av8

.type ihevc_inter_pred_luma_horz_w16out_av8, %function

ihevc_inter_pred_luma_horz_w16out_av8:

    push_v_regs
    stp         x19, x20, [sp, #-16]!

    // The four integer arguments are 32-bit (word32). AAPCS64 leaves the upper
    // 32 bits of their registers unspecified, and everything below uses the
    // full X registers for pointer arithmetic and loop counts.
    sxtw        x2, w2                      //src_strd
    sxtw        x3, w3                      //dst_strd
    sxtw        x5, w5                      //ht
    sxtw        x6, w6                      //wd

    cmp         x5, #0                      //nothing to do for ht <= 0
    ble         end_loops1

    ld1         {v0.8b}, [x4]               //coeff = vld1_s8(pi1_coeff)
    mov         x19, x5                     //x19 = ht
    abs         v2.8b, v0.8b                //vabs_s8(coeff)
    mov         x14, x6                     //x14 = wd
    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x16, x0, #3                 //pu1_src - 3
    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    lsl         x13, x2, #1
    sub         x13, x13, x14               //x13 = 2*src_strd - wd
    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    sub         x12, x3, x14                //x12 = dst_strd - wd
    dup         v28.8b, v2.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    dup         v29.8b, v2.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    and         x11, x19, #1                //ht_residue = (ht & 1)
    dup         v30.8b, v2.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    sub         x19, x19, x11               //x19 = even part of ht
    dup         v31.8b, v2.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    // ht == 1: there is no complete row pair. The row-pair loops always run
    // at least once, so go straight to the single-row loop (x11 == 1 here),
    // which handles any width in steps of 4 columns.
    cbz         x19, outer_loop_height_residue_4

    cmp         x11, #1
    beq         odd_height_decision

even_height_decision:
    mov         x11, x1                     //remember dst base for the residual strips
    cmp         x14, #4
    ble         outer_loop_4

    // wd == 24: run the 16-wide loop on the first 16 columns; the remaining
    // 8 columns are done by outer_loop8_residual. Row steps grow by 8.
    cmp         x14, #24
    mov         x20, #16
    csel        x14, x20, x14, eq
    add         x20, x12, #8
    csel        x12, x20, x12, eq
    add         x20, x13, #8
    csel        x13, x20, x13, eq

    cmp         x14, #16
    bge         outer_loop_16_branch

    // wd == 12: the 8-wide loop does the first 8 columns; the remaining
    // 4 columns are done by outer_loop4_residual. Row steps grow by 4.
    cmp         x14, #12
    add         x20, x12, #4
    csel        x12, x20, x12, eq
    add         x20, x13, #4
    csel        x13, x20, x13, eq
outer_loop_8_branch:
    b           outer_loop_8

outer_loop_16_branch:
    b           outer_loop_16


odd_height_decision:
    // Odd heights with wd 24 / 12 are not split into strips: the 8-wide
    // (resp. 4-wide) loop walks the full width.
    cmp         x14, #24
    beq         outer_loop_8_branch
    cmp         x14, #12
    beq         outer_loop_4
    b           even_height_decision

//------------------------------------------------------------------------------
// Last 4 columns of a 12-wide block. Only reached from the 8-wide loop, i.e.
// with an even height.
//------------------------------------------------------------------------------
outer_loop4_residual:
    sub         x16, x0, #3                 //pu1_src - 3
    mov         x1, x11                     //dst base
    add         x1, x1, #16                 //skip 8 output columns (16 bytes)
    mov         x14, #4
    add         x16, x16, #8                //skip 8 input columns
    mov         x19, x5                     //all ht rows again (ht is even on this path)
    add         x12, x12, #4                //x12 = dst_strd - 4
    add         x13, x13, #4                //x13 = 2*src_strd - 4

//------------------------------------------------------------------------------
// 4 columns x 2 rows per inner iteration.
//------------------------------------------------------------------------------
outer_loop_4:
    add         x10, x1, x3, lsl #1         //pi2_dst + dst_strd
    add         x8, x16, x2                 //pu1_src + src_strd

    subs        x9, x14, #0                 //checks wd
    ble         end_inner_loop_4

inner_loop_4:
    // Eight 8-byte loads per row at byte offsets 0..7 give the eight tap
    // inputs. zip1 packs the low 4 bytes of row i and row ii into one vector
    // so a single multiply-accumulate chain filters both rows.
    // The zip2 results (v12-v19) are not read by the arithmetic below.
    mov         x15, #1
    ld1         {v20.2s}, [x16], x15        //row i,  src[0]
    ld1         {v21.2s}, [x16], x15        //row i,  src[1]
    ld1         {v22.2s}, [x8], x15         //row ii, src[0]
    ld1         {v23.2s}, [x8], x15         //row ii, src[1]

    zip1        v0.2s, v20.2s, v22.2s
    zip2        v12.2s, v20.2s, v22.2s
    zip1        v1.2s, v21.2s, v23.2s
    zip2        v13.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x16], x15        //src[2]
    ld1         {v21.2s}, [x16], x15        //src[3]
    ld1         {v22.2s}, [x8], x15
    ld1         {v23.2s}, [x8], x15

    zip1        v2.2s, v20.2s, v22.2s
    zip2        v14.2s, v20.2s, v22.2s
    zip1        v3.2s, v21.2s, v23.2s
    zip2        v15.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x16], x15        //src[4]
    ld1         {v21.2s}, [x16], x15        //src[5]
    ld1         {v22.2s}, [x8], x15
    ld1         {v23.2s}, [x8], x15

    zip1        v4.2s, v20.2s, v22.2s
    zip2        v16.2s, v20.2s, v22.2s
    zip1        v5.2s, v21.2s, v23.2s
    zip2        v17.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x16], x15        //src[6]
    ld1         {v21.2s}, [x16], x15        //src[7]
    ld1         {v22.2s}, [x8], x15
    ld1         {v23.2s}, [x8], x15

    sub         x16, x16, #4                //8 byte steps - 4 = net advance of 4 columns
    sub         x8, x8, #4

    zip1        v6.2s, v20.2s, v22.2s
    zip2        v18.2s, v20.2s, v22.2s
    zip1        v7.2s, v21.2s, v23.2s
    zip2        v19.2s, v21.2s, v23.2s

    umull       v8.8h, v1.8b, v25.8b        //  src[1] * coeffabs_1 (rows i and ii together)
    umlsl       v8.8h, v0.8b, v24.8b        //- src[0] * coeffabs_0
    umlsl       v8.8h, v2.8b, v26.8b        //- src[2] * coeffabs_2
    umlal       v8.8h, v3.8b, v27.8b        //+ src[3] * coeffabs_3
    umlal       v8.8h, v4.8b, v28.8b        //+ src[4] * coeffabs_4
    umlsl       v8.8h, v5.8b, v29.8b        //- src[5] * coeffabs_5
    umlal       v8.8h, v6.8b, v30.8b        //+ src[6] * coeffabs_6
    umlsl       v8.8h, v7.8b, v31.8b        //- src[7] * coeffabs_7

    st1         {v8.d}[0], [x1], #8         //row i  result: lower half of the register
    st1         {v8.d}[1], [x10], #8        //row ii result: upper half of the register
    subs        x9, x9, #4                  //decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        x19, x19, #2                //two rows done
    add         x16, x16, x13               //src += 2*src_strd - wd
    add         x1, x10, x12, lsl #1        //dst  = row ii + (dst_strd - wd) elements
    bgt         outer_loop_4

//------------------------------------------------------------------------------
// Last row of an odd-height block, 4 columns per inner iteration.
// On entry from the row-pair loops x16 / x1 already point at that row.
//------------------------------------------------------------------------------
height_residue_4:
    mov         x11, x5                     //loads ht
    and         x11, x11, #1                //ht_residue = (ht & 1)
    cbz         x11, end_loops1             //even height: nothing left

outer_loop_height_residue_4:
    subs        x9, x14, #0                 //checks wd
    ble         end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    mov         x15, #1
    ld1         {v0.2s}, [x16], x15         //src[0]
    ld1         {v1.2s}, [x16], x15         //src[1]
    ld1         {v2.2s}, [x16], x15         //src[2]
    umull       v8.8h, v1.8b, v25.8b        //  src[1] * coeffabs_1
    ld1         {v3.2s}, [x16], x15         //src[3]
    umlsl       v8.8h, v0.8b, v24.8b        //- src[0] * coeffabs_0
    ld1         {v4.2s}, [x16], x15         //src[4]
    umlsl       v8.8h, v2.8b, v26.8b        //- src[2] * coeffabs_2
    ld1         {v5.2s}, [x16], x15         //src[5]
    umlal       v8.8h, v3.8b, v27.8b        //+ src[3] * coeffabs_3
    ld1         {v6.2s}, [x16], x15         //src[6]
    umlal       v8.8h, v4.8b, v28.8b        //+ src[4] * coeffabs_4
    ld1         {v7.2s}, [x16], x15         //src[7]
    umlsl       v8.8h, v5.8b, v29.8b        //- src[5] * coeffabs_5
    sub         x16, x16, #4                //net advance of 4 columns
    umlal       v8.8h, v6.8b, v30.8b        //+ src[6] * coeffabs_6
    umlsl       v8.8h, v7.8b, v31.8b        //- src[7] * coeffabs_7
    subs        x9, x9, #4                  //decrement the wd by 4
    st1         {v8.d}[0], [x1], #8         //only the lower 4 results are valid
    bgt         inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    // x11 is 1 on entry, so this loop body runs exactly once; the pointer
    // updates below are kept from the original code.
    subs        x11, x11, #1                //one row done
    sub         x13, x2, x14                //src_strd - wd
    add         x16, x16, x13               //increment the input pointer src_strd-wd
    add         x1, x1, x12                 //increment the output pointer
    bgt         outer_loop_height_residue_4

    b           end_loops1

//------------------------------------------------------------------------------
// Last 8 columns of a 24-wide block. Only reached from the 16-wide loop, i.e.
// with an even height.
//------------------------------------------------------------------------------
outer_loop8_residual:
    sub         x16, x0, #3                 //pu1_src - 3
    mov         x1, x11                     //dst base
    mov         x19, x5                     //all ht rows again (ht is even on this path)
    add         x1, x1, #32                 //skip 16 output columns (32 bytes)
    add         x16, x16, #16               //skip 16 input columns
    mov         x14, #8
    add         x12, x12, #8                //x12 = dst_strd - 8
    add         x13, x13, #8                //x13 = 2*src_strd - 8

//------------------------------------------------------------------------------
// 8 columns x 2 rows per inner iteration.
//------------------------------------------------------------------------------
outer_loop_8:
    add         x10, x1, x3, lsl #1         //pi2_dst + dst_strd
    add         x8, x16, x2                 //pu1_src + src_strd
    subs        x9, x14, #0                 //checks wd

    ble         end_inner_loop_8

inner_loop_8:
    mov         x15, #1
    ld1         {v0.2s}, [x16], x15         //row i, src[0]
    ld1         {v1.2s}, [x16], x15         //src[1]
    ld1         {v2.2s}, [x16], x15         //src[2]
    ld1         {v3.2s}, [x16], x15         //src[3]
    ld1         {v4.2s}, [x16], x15         //src[4]
    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v5.2s}, [x16], x15         //src[5]
    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v6.2s}, [x16], x15         //src[6]
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v7.2s}, [x16], x15         //src[7]
    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v12.2s}, [x8], x15         //row ii, src[0]
    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v13.2s}, [x8], x15
    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    ld1         {v14.2s}, [x8], x15
    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    ld1         {v15.2s}, [x8], x15
    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    ld1         {v16.2s}, [x8], x15

    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v17.2s}, [x8], x15
    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v18.2s}, [x8], x15
    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v19.2s}, [x8], x15
    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    st1         {v8.8h}, [x1], #16          //store row i result
    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    subs        x9, x9, #8                  //decrement the wd loop
    st1         {v10.8h}, [x10], #16        //store row ii result
    cmp         x9, #4                      //a 4-column remainder (wd == 12) is left to outer_loop4_residual
    bgt         inner_loop_8

end_inner_loop_8:
    subs        x19, x19, #2                //two rows done
    add         x16, x16, x13               //src += 2*src_strd - wd
    add         x1, x10, x12, lsl #1        //dst  = row ii + (dst_strd - wd) elements
    bgt         outer_loop_8

    mov         x14, x6                     //loads wd
    cmp         x14, #12
    beq         outer_loop4_residual        //remaining 4 columns of a 12-wide block

    b           height_residue_4            //handles the odd last row, or returns

//------------------------------------------------------------------------------
// 16 columns x 2 rows per inner iteration, software pipelined: the row i loads
// and the first half of its arithmetic for the next iteration are issued at
// the bottom of the current one.
//
// Loads alternate a post-increment of 8 and of x15 (= -7): each pair fetches
// bytes [k, k+7] and [k+8, k+15] and leaves the pointer at k+1, so
//   v0/v1 = src[0], v2/v3 = src[1], v4/v5 = src[2], v6/v7 = src[3],
//   v12/v13 = src[4], v14/v15 = src[5], v16/v17 = src[6], v18/v19 = src[7]
// for columns 0-7 / 8-15.
//------------------------------------------------------------------------------
outer_loop_16:
    mov         x15, #-7
    add         x10, x1, x3, lsl #1         //pi2_dst + dst_strd
    add         x8, x16, x2                 //pu1_src + src_strd
    mov         x9, x14                     //column counter
    add         x20, x16, x2, lsl #1
    prfm        PLDL1KEEP, [x20]
    ld1         {v0.2s}, [x16], #8          //vector load pu1_src
    ld1         {v1.2s}, [x16], x15         //vector load pu1_src
    add         x20, x8, x2, lsl #1
    prfm        PLDL1KEEP, [x20]
    ld1         {v2.2s}, [x16], #8
    ld1         {v3.2s}, [x16], x15
    ld1         {v4.2s}, [x16], #8
    ld1         {v5.2s}, [x16], x15
    ld1         {v6.2s}, [x16], #8
    ld1         {v7.2s}, [x16], x15
    ld1         {v12.2s}, [x16], #8
    ld1         {v13.2s}, [x16], x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s}, [x16], #8
    ld1         {v15.2s}, [x16], x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s}, [x16], #8
    ld1         {v17.2s}, [x16], x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s}, [x16], #8
    ld1         {v19.2s}, [x16], x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//


inner_loop_16:
    // The flags set by this subs stay live until the csel group below: no
    // instruction in between writes NZCV.
    subs        x9, x9, #16
    umull       v20.8h, v3.8b, v25.8b       //row i, columns 8-15

    add         x16, x16, #8                //8 byte steps + 8 = net advance of 16 columns
    umlsl       v20.8h, v1.8b, v24.8b

    ld1         {v0.2s}, [x8], #8           //row ii loads
    ld1         {v1.2s}, [x8], x15
    umlal       v20.8h, v7.8b, v27.8b

    ld1         {v2.2s}, [x8], #8
    ld1         {v3.2s}, [x8], x15
    umlsl       v20.8h, v5.8b, v26.8b

    ld1         {v4.2s}, [x8], #8
    ld1         {v5.2s}, [x8], x15
    umlal       v20.8h, v13.8b, v28.8b

    ld1         {v6.2s}, [x8], #8
    ld1         {v7.2s}, [x8], x15
    umlal       v20.8h, v17.8b, v30.8b

    ld1         {v12.2s}, [x8], #8
    ld1         {v13.2s}, [x8], x15
    umlsl       v20.8h, v15.8b, v29.8b

    ld1         {v14.2s}, [x8], #8
    ld1         {v15.2s}, [x8], x15
    umlsl       v20.8h, v19.8b, v31.8b

    ld1         {v16.2s}, [x8], #8
    ld1         {v17.2s}, [x8], x15
    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s}, [x8], #8
    ld1         {v19.2s}, [x8], x15
    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_3], coeffabs_3)//

    add         x8, x8, #8
    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    add         x20, x16, x2, lsl #2
    prfm        PLDL1KEEP, [x20]
    add         x20, x8, x2, lsl #2
    prfm        PLDL1KEEP, [x20]
    st1         {v8.16b}, [x1], #16         //store row i, columns 0-7
    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    // End of a row pair (x9 reached 0): step both source pointers to the next
    // pair and count the two rows.
    add         x20, x16, x13               //src + 2*src_strd - wd
    csel        x16, x20, x16, eq
    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//

    add         x20, x16, x2                //pu1_src + src_strd
    csel        x8, x20, x8, eq
    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//

    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//

    sub         x20, x19, #2
    csel        x19, x20, x19, eq
    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//

    // Two prefetches that used x20 at this point were dropped: their address
    // computations had been commented out, so they hinted at 'x19 - 2'.
    umull       v22.8h, v3.8b, v25.8b       //row ii, columns 8-15
    umlsl       v22.8h, v1.8b, v24.8b

    st1         {v20.8h}, [x1], #16         //store row i, columns 8-15
    umlal       v22.8h, v7.8b, v27.8b
    umlsl       v22.8h, v5.8b, v26.8b
    umlal       v22.8h, v13.8b, v28.8b

    cmp         x19, #0                     //all row pairs done?
    umlal       v22.8h, v17.8b, v30.8b

    st1         {v10.8h}, [x10], #16        //store row ii, columns 0-7
    umlsl       v22.8h, v15.8b, v29.8b

    umlsl       v22.8h, v19.8b, v31.8b

    beq         epilog_16

    ld1         {v0.2s}, [x16], #8          //row i loads for the next iteration
    ld1         {v1.2s}, [x16], x15
    ld1         {v2.2s}, [x16], #8
    ld1         {v3.2s}, [x16], x15
    ld1         {v4.2s}, [x16], #8
    ld1         {v5.2s}, [x16], x15
    ld1         {v6.2s}, [x16], #8
    ld1         {v7.2s}, [x16], x15
    ld1         {v12.2s}, [x16], #8
    ld1         {v13.2s}, [x16], x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s}, [x16], #8
    ld1         {v15.2s}, [x16], x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s}, [x16], #8
    ld1         {v17.2s}, [x16], x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s}, [x16], #8
    ld1         {v19.2s}, [x16], x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    cmp         x9, #0                      //row pair finished?
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    csel        x9, x14, x9, eq             //  yes: reload the column counter
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    st1         {v22.16b}, [x10], #16       //store row ii, columns 8-15
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    add         x20, x10, x12, lsl #1
    csel        x1, x20, x1, eq             //  yes: dst row i  = row ii + (dst_strd - wd)
    add         x20, x1, x3, lsl #1         //pi2_dst + dst_strd
    csel        x10, x20, x10, eq           //  yes: dst row ii = row i + dst_strd
    b           inner_loop_16


epilog_16:
    st1         {v22.16b}, [x10], #16       //store row ii, columns 8-15

    mov         x14, x6                     //loads wd
    cmp         x14, #24
    beq         outer_loop8_residual        //remaining 8 columns of a 24-wide block
    add         x1, x10, x12, lsl #1        //dst for a possible odd last row
    b           height_residue_4            //handles the odd last row, or returns

end_loops1:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret