1 ///***************************************************************************** 2 //* 3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //*****************************************************************************/ 18 ///** 19 //******************************************************************************* 20 //* //file 21 //* ihevc_inter_pred_chroma_horz_neon.s 22 //* 23 //* //brief 24 //* contains function definitions for inter prediction interpolation. 
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* //author
//*  yogeswaran rs / akshaya mukund
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*     chroma interprediction filter to store horizontal 16bit output
//*
//* //par description:
//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*    to the elements pointed by 'pu1_src' and writes to the location pointed
//*    by 'pu1_dst'  no downshifting or clipping is done and the output is used
//*    as an input for vertical filtering or weighted prediction
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
//                                          word16 *pi2_dst,
//                                          word32 src_strd,
//                                          word32 dst_strd,
//                                          word8 *pi1_coeff,
//                                          word32 ht,
//                                          word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 =>  src_strd
//x3 =>  dst_strd


.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_horz_w16out_av8


.type ihevc_inter_pred_chroma_horz_w16out_av8, %function

//-----------------------------------------------------------------------------
// AAPCS64 entry: x0=pu1_src, x1=pi2_dst, x2=src_strd, x3=dst_strd,
//                x4=pi1_coeff, x5=ht, x6=wd.  No return value.
// Callee-saved d10-d15 and x19/x20 are preserved on the stack below.
// Four code paths are selected on wd/ht:
//   - wd not a multiple of 4            -> outer_loop_4 (zip-based, 4/iter)
//   - wd >= 8 (and wd != 12)            -> outer_loop_16 (pipelined, 16/iter)
//   - ht a multiple of 4                -> outer_loop_ht_4 (software pipelined)
//   - otherwise                         -> outer_loop_8
// An odd final row (ht & 1) is handled by loop_residue_4 after any path.
// The 4-tap filter applied per output sample is
//   -c0*s[-2] + c1*s[0] + c2*s[2] - c3*s[4]   (chroma samples interleaved,
// hence the 2-byte tap spacing), accumulated into 16-bit lanes without
// rounding/shift — the caller downshifts later.
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_horz_w16out_av8:

    // stmfd sp!, {x4-x12, x14}                 //stack stores the values of the arguments

    // save callee-saved SIMD (low 64 bits of v10-v15) and x19/x20 per AAPCS64
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    mov         x15,x4 // pi1_coeff
    mov         x16,x5 // ht
    mov         x17,x6 // wd

    mov         x4,x15                      //loads pi1_coeff
    mov         x6,x16                      //loads ht
    mov         x10,x17                     //loads wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    subs        x14,x6,#0                   //checks for ht == 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff) — taps are applied as |c| with fixed add/sub pattern

    //******* added
    mov         x11, #2                     //x11 = 2: post-increment between the four staggered tap loads
    //******* added ends

    ble         end_loops                   //nothing to do for ht <= 0

    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#2                   //pu1_src - 2 (back up to the first tap position)
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd (second row pointer)
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         x10,#3                      //checks wd for multiples of 4
    lsl         x5, x10, #1                 //x5 = 2*wd (interleaved-chroma row width in bytes)

    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         x7,x14,#1                   //added //calculating ht_residue ht_residue = (ht & 1)
    sub         x14,x14,x7                  //added //decrement height by ht_residue(residue value is calculated outside)

    bne         outer_loop_4                // this branching happens when the width is 2 or 6

    cmp         x10,#12
    beq         skip_16                     //wd == 12 cannot use the 16-wide path

    cmp         x10,#8
    bge         outer_loop_16

skip_16:
    tst         x6,#3

    //******* removal
    //mov        x11,#8
    //******* removal ends

    sub         x9,x0,#2                    //x9 = pu1_src - 2 for the ht%4==0 path
    beq         outer_loop_ht_4             //this branching happens when the height is a multiple of 4



    // cmp          x10,#12
    // beq      outer_loop_8
    // cmp          x10,#16
    // bge      outer_loop_16
    b           outer_loop_8



//-----------------------------------------------------------------------------
// 16-outputs-per-iteration path, two rows in flight (x12 = row N, x4 = row N+1).
// Each 16-wide group is fetched as four staggered loads per half (+2 bytes apart,
// the last advancing by x9 = 10 to land on the next 16-byte group: 2+2+2+10 = 16).
// Accumulators: v30/v28 (row N low/high halves), v22/v20 (row N+1).
//-----------------------------------------------------------------------------
outer_loop_16:
    add         x4,x12,x2                   //second-row source pointer


    and         x0, x12, #31                //x0 reused as scratch from here on (arg value consumed)
    add         x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]             //prefetch two rows ahead


    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    mov         x10,x5                      //2wd (width countdown)
    mul         x14, x14 , x10              //x14 = ht * 2wd: total output count drives the fused loop
    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    add         x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]
    mov         x9,#10                      //final-load post-increment: jump to next 16-byte group
    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    sub         x20,x3,#8
    neg         x6, x20                     //x6 = 8 - dst_strd (halfwords); doubled below for byte offset
    sub         x8,x3,#8                    //x8 = dst_strd - 8 (halfwords); doubled below
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src


    add         x19,x4,#8
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src

    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    // NOTE(review): .4s here loads 16 bytes although only the low 8 (v14.8b)
    // are consumed; the matching load elsewhere in this file is .2s — confirm
    // the 8-byte over-read past the group is intentional/harmless.
    ld1         { v14.4s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umull       v28.8h, v3.8b, v25.8b
    lsl         x6,x6,#1                    //x6 = (8 - dst_strd)*2 bytes: store offset back from row N+1 high half
    sub         x20,x5,x3,lsl #1
    neg         x3, x20                     //x3 repurposed: 2*dst_strd - 2*wd (dst row advance minus width stored)
    umlsl       v28.8h, v1.8b, v24.8b
    lsl         x8,x8,#1                    //x8 = (dst_strd - 8)*2 bytes: hop from row N high half to row N+1
    sub         x20,x5,x2,lsl #1
    neg         x7, x20                     //x7 repurposed: 2*src_strd - 2*wd (src advance at row-pair wrap)
    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b
    cmp         x14,#32
    beq         epilog_end                  //only one row pair total: skip straight to drain
    sub         x14, x14,#64                //reserve two row-pair groups for prologue+epilog

inner_loop_16:

    // and          x7, x12, #31                    //decrement the wd loop
    // cmp          x7, x0
    add         x20,x12, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    add         x20,x4, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]


    subs        x10,x10,#16                 //width countdown; eq => this row pair is finished

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//



    // add          x20,x12,x2,lsl #1
    //csel        x12, x20, x12,eq
    // sub          x20,x12,x5
    //csel        x12, x20, x12,eq
    add         x20,x12,x7                  //on row wrap: advance src by 2*src_strd - 2*wd
    csel        x12, x20, x12,eq
    add         x20,x12,x2
    csel        x4, x20, x4,eq              //and rebase second-row pointer


    st1         { v30.8h}, [x1],#16         //store row N, outputs 0-7
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//




    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//




    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    st1         { v28.8h}, [x1],x8          //store row N, outputs 8-15, then hop to row N+1
    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    st1         { v22.8h},[x1],#16          //store row N+1, outputs 0-7
    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    csel        x10, x5, x10,eq             //2wd — reload width countdown on row wrap
    umull       v28.8h, v3.8b, v25.8b



    umlsl       v28.8h, v1.8b, v24.8b
    st1         { v20.8h},[x1],x6           //store row N+1, outputs 8-15, step back to row N position


    add         x20,x1,x3,lsl #1            //on row wrap: dst += (2*dst_strd - 2*wd) halfwords
    csel        x1, x20, x1,eq
    umlal       v28.8h, v5.8b, v26.8b

    subs        x14,x14,#32                 //decrement the ht loop (32 outputs per iteration)
    umlsl       v28.8h, v7.8b, v27.8b



    // mov            x0, x7
    bgt         inner_loop_16



    add         x14,x14,#64
    cmp         x14,#32
    beq         epilog_end                  //exactly one group left in flight: skip epilog iteration

epilog:

    st1         { v30.8h}, [x1],#16
    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    st1         { v28.8h}, [x1],x8



    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    subs        x10,x10,#16                 //decrement the wd loop
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    // add          x20,x12,x2,lsl #1
    //csel        x12, x20, x12,eq
    add         x20,x12,x7                  //row wrap, same as inner loop
    csel        x12, x20, x12,eq
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    // sub          x20,x12,x5
    //csel        x12, x20, x12,eq
    csel        x10, x5, x10,eq             //2wd
    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src

    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umull       v28.8h, v3.8b, v25.8b

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umlsl       v28.8h, v1.8b, v24.8b
    st1         { v22.8h},[x1],#16          //store the result pu1_dst
    umlal       v28.8h, v5.8b, v26.8b
    st1         { v20.8h},[x1],x6           //store the result pu1_dst
    umlsl       v28.8h, v7.8b, v27.8b
    add         x20,x1,x3,lsl #1
    csel        x1, x20, x1,eq


epilog_end:

    // drain the last in-flight row pair: no further loads, compute + store only
    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    st1         { v30.8h}, [x1],#16
    st1         { v28.8h}, [x1],x8
    st1         { v22.8h},[x1],#16          //store the result pu1_dst
    st1         { v20.8h},[x1],x6           //store the result pu1_dst


    mov         x6,x16                      //loads ht

    and         x7,x6,#1                    //recompute ht residue (x7 was repurposed above)

    cmp         x7,#0
    mov         x10,x5
    add         x20,x12,x2,lsl #1           //if residue: position src/dst on the final odd row
    csel        x12, x20, x12,ne
    sub         x20,x12,x5
    csel        x12, x20, x12,ne
    add         x20,x1,x3,lsl #1
    csel        x1, x20, x1,ne


    bgt         loop_residue_4

    b           end_loops



//-----------------------------------------------------------------------------
// 8-outputs-per-iteration path: processes two rows per outer iteration
// (x12/x1 = row N, x4/x6 = row N+1).
//-----------------------------------------------------------------------------
outer_loop_8:

    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
    mov         x10,x5                      //2wd
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_8:
    //ld1        {v0.2s, v1.2s},[x12],x11               //vector load pu1_src
    ld1         {v0.2s},[x12],x11           //vector load pu1_src
    ld1         {v1.2s},[x12],x11           //vector load pu1_src
    ld1         {v2.2s},[x12],x11           //vector load pu1_src
    ld1         {v3.2s},[x12],x11           //vector load pu1_src


    //vext.u8    d2,d0,d1,#2                //vector extract of src[0_2]
    umull       v29.8h, v1.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v29.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8    d4,d0,d1,#4                //vector extract of src[0_4]
    //vext.u8    d6,d0,d1,#6                //vector extract of src[0_6]
    umlal       v29.8h, v2.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v29.8h, v3.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    //ld1        {v12.2s, v13.2s},[x4],x11              //vector load pu1_src + src_strd
    ld1         {v4.2s},[x4],x11            //vector load pu1_src
    ld1         {v5.2s},[x4],x11            //vector load pu1_src
    ld1         {v6.2s},[x4],x11            //vector load pu1_src
    ld1         {v7.2s},[x4],x11            //vector load pu1_src
    //vext.u8    d14,d12,d13,#2             //vector extract of src[0_2]
    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8    d16,d12,d13,#4             //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6             //vector extract of src[0_6]
    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v29.8h}, [x1],#16

    subs        x10,x10,#8                  //decrement the wd loop
    st1         {v10.8h},[x6],#16           //store the result pu1_dst
    bgt         inner_loop_8

    sub         x12,x12,x5                  //rewind src by width consumed...
    subs        x14,x14,#2                  //decrement the ht loop (two rows done)
    sub         x1,x1,x5,lsl #1             //...and dst likewise
    add         x12,x12,x2,lsl #1           //then advance both by two rows
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_8

    cmp         x7,#0
    mov         x10,x5
    bgt         loop_residue_4              //odd final row remains

    b           end_loops



//-----------------------------------------------------------------------------
// Path for ht being a multiple of 4: four rows are pipelined per column strip.
// x9 walks the source columns; x0 = src_strd - 6 is the row-to-row hop after
// the three +2 staggered loads.  Accumulators v29/v10/v12/v22 = rows 1..4.
//-----------------------------------------------------------------------------
outer_loop_ht_4:

    mov         x10,x5

prologue_ht_4:
    lsl         x8, x3, #1                  //x8 = 2*dst_strd: store-to-store row hop in bytes

inner_loop_ht_4:

    mov         x12,x9
    mov         x4,x1

    sub         x0, x2, #6                  //row hop after three +2 loads (2+2+2+(src_strd-6) = src_strd)
                                            // not sure if x0 needs to be preserved

    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v3.2s},[x12],x0            //(1)vector load pu1_src

    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v7.2s},[x12],x0            //(2)vector load pu1_src

    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v17.2s},[x12],x0           //(3)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    add         x9,x9,#8                    //(core loop) advance source column strip

    subs        x10,x10,#8                  //(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    st1         {v29.8h},[x4],x8            //(1)store the result pu1_dst
    mov         x12,x9

    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v3.2s},[x12],x0            //(1_1)vector load pu1_src
    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v10.8h},[x4],x8            //(2)store the result pu1_dst
    add         x9,x9,#8                    //(core loop)

    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v7.2s},[x12],x0            //(2_1)vector load pu1_src
    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v12.8h},[x4],x8            //(3)store the result pu1_dst
    add         x1,x1,#16                   //(core loop) next 8-output dst column

    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v17.2s},[x12],x0           //(3_1)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v22.8h}, [x4], x8          //(4)store the result pu1_dst
    subs        x10,x10,#8                  //(core loop)

    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src

    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    mov         x4, x1                      //(core loop)

    ld1         {v21.2s},[x12],x0           //(4_1)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//



    bgt         core_loop                   //loopback

epilogue:
    // drain the four pipelined rows for the last column strip
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v29.8h},[x4], x8           //(1)store the result pu1_dst

    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v10.8h},[x4], x8           //(2)store the result pu1_dst

    st1         {v12.8h},[x4], x8           //(3)store the result pu1_dst

    add         x1,x1,#16                   //(core loop)

    st1         {v22.8h},[x4], x8           //(4)store the result pu1_dst

    sub         x9,x9,x5                    //rewind src by width consumed...
    subs        x14,x14,#4                  //decrement the ht loop (four rows done)
    sub         x1,x1,x5,lsl #1             //...and dst likewise
    add         x9,x9,x2,lsl #2             //then advance both by four rows
    add         x1,x1,x3,lsl #3
    bgt         outer_loop_ht_4

    cmp         x7,#0
    mov         x10,x5
    csel        x12, x9, x12,gt
    csel        x4, x1, x4,gt
    bgt         loop_residue_4

    b           end_loops

//-----------------------------------------------------------------------------
// Narrow path (wd == 2 or 6): two rows are zipped together so 4 outputs per
// row are produced per iteration from interleaved registers.
//-----------------------------------------------------------------------------
outer_loop_4:
    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
    mov         x10,x5
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_4:
    //ld1        {v0.2s, v1.2s},[x12]                   //vector load pu1_src
    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src

    //**** removal
    //add        x12,x12,#4                             //increment the input pointer
    //**** removal ends
    //**** addn
    sub         x12,x12,#2                  //net advance of +4: three +2 steps minus 2 (4 outputs consumed)
    //**** addn ends
    ld1         {v16.2s},[x4],x11           //vector load pu1_src
    ld1         {v17.2s},[x4],x11           //vector load pu1_src
    ld1         {v18.2s},[x4],x11           //vector load pu1_src
    ld1         {v19.2s},[x4]               //vector load pu1_src
    //vext.u8    d2,d0,d1,#2                //vector extract of src[0_2]
    //vext.u8    d4,d0,d1,#4                //vector extract of src[0_4]
    //ld1        {v12.2s, v13.2s},[x4]                  //vector load pu1_src + src_strd
    //vext.u8    d6,d0,d1,#6                //vector extract of src[0_6]

    //add        x4,x4,#4                               //increment the input pointer
    sub         x4,x4,#2
    //vext.u8    d14,d12,d13,#2             //vector extract of src[0_2]
    //vext.u8    d16,d12,d13,#4             //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6             //vector extract of src[0_6]

    //**** removal
    //zip1       v0.2s, v0.2s, v12.2s
    //zip2       v12.2s, v0.2s, v12.2s                  //vector zip the i iteration and ii iteration in single register
    //zip1       v2.2s, v2.2s, v14.2s
    //zip2       v14.2s, v2.2s, v14.2s
    //zip1       v4.2s, v4.2s, v16.2s
    //zip2       v16.2s, v4.2s, v16.2s
    //zip1       v6.2s, v6.2s, v18.2s
    //zip2       v18.2s, v6.2s, v18.2s
    //**** removal ends
    //**** addn
    zip1        v0.2s, v20.2s, v16.2s       //interleave row0/row1 tap data so one MAC chain does both rows
    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i iteration and ii iteration in single register
    zip1        v1.2s, v21.2s, v17.2s
    zip2        v5.2s, v21.2s, v17.2s
    zip1        v2.2s, v22.2s, v18.2s
    zip2        v6.2s, v22.2s, v18.2s
    zip1        v3.2s, v23.2s, v19.2s
    zip2        v7.2s, v23.2s, v19.2s
    //**** addn ends

    umull       v29.8h, v1.8b, v25.8b       //arithmetic operations for ii iteration in the same time
    umlsl       v29.8h, v0.8b, v24.8b
    umlal       v29.8h, v2.8b, v26.8b
    umlsl       v29.8h, v3.8b, v27.8b

    st1         {v29.d}[0],[x1],#8          //store the i iteration result which is in upper part of the register
    subs        x10,x10,#4                  //decrement the wd by 4

    st1         {v29.d}[1],[x6],#8          //store the ii iteration result which is in lower part of the register

    bgt         inner_loop_4

    sub         x12,x12,x5                  //rewind src by width consumed...
    subs        x14,x14,#2                  //decrement the ht by 2
    sub         x1,x1,x5,lsl #1             //...and dst likewise
    add         x12,x12,x2,lsl #1           //then advance both by two rows
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_4

    cmp         x7,#0
    mov         x10,x5
    beq         end_loops

//-----------------------------------------------------------------------------
// Single leftover row (ht was odd): 4 outputs per iteration, one row only.
//-----------------------------------------------------------------------------
loop_residue_4:

    mov         x10,x5                      //2wd

loop_residue:

    //ld1        {v0.2s, v1.2s},[x12]                   //vector load pu1_src
    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src
    //vext.u8    d2,d0,d1,#2                //vector extract of src[0_2]
    //umull      v8.8h, v2.8b, v25.8b               //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    //umlsl      v8.8h, v0.8b, v24.8b               //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8    d4,d0,d1,#4                //vector extract of src[0_4]
    //add        x12,x12,#4                 //pu1_src + 4
    sub         x12, x12, #2                //net advance of +4 (see inner_loop_4)
    //vext.u8    d6,d0,d1,#6                //vector extract of src[0_6]
    //umlal      v8.8h, v4.8b, v26.8b               //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //umlsl      v8.8h, v6.8b, v27.8b               //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    umull       v29.8h, v21.8b, v25.8b
    umlsl       v29.8h, v20.8b, v24.8b
    umlal       v29.8h, v22.8b, v26.8b
    umlsl       v29.8h, v23.8b, v27.8b

    st1         {v29.1d},[x1]               //store the result pu1_dst
    subs        x10,x10,#4                  //decrement the wd loop
    add         x1,x1,#8                    //pi2_dst + 8

    bgt         loop_residue                //loop again

    //inner loop ends
    //add         x8,x3,lsl #1                   //2*dst_strd
    //sub         x8,x8,x5,lsl #1                //2*dst_strd - 2wd
    //sub         x9,x2,x5                       //src_strd - 2wd
    //subs        x7,x7,#1                       //decrement the ht loop
    //add         x12,x12,x9                     //pu1_src + src_strd
    //add         x1,x1,x8                       //pu1_dst + 2*dst_strd
    //bgt         outer_loop_residue_4           //loop again
    //b           end_loops                      //jumps to end

end_loops:

    // ldmfd sp!,{x4-x12,x15}                   //reload the registers from sp
    // restore callee-saved registers in reverse order of the prologue
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ret