1 ///***************************************************************************** 2 //* 3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 //* 5 //* Licensed under the Apache License, Version 2.0 (the "License"); 6 //* you may not use this file except in compliance with the License. 7 //* You may obtain a copy of the License at: 8 //* 9 //* http://www.apache.org/licenses/LICENSE-2.0 10 //* 11 //* Unless required by applicable law or agreed to in writing, software 12 //* distributed under the License is distributed on an "AS IS" BASIS, 13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 //* See the License for the specific language governing permissions and 15 //* limitations under the License. 16 //* 17 //*****************************************************************************/ 18 ///** 19 //****************************************************************************** 20 //* //file 21 //* ihevc_inter_pred_luma_horz.s 22 //* 23 //* //brief 24 //* contains function definitions for inter prediction interpolation. 
//* functions are coded using neon intrinsics and can be compiled using

//* rvct
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_luma_horz()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
///* include reconstruction */
//

///**
//*******************************************************************************
//*
//* //brief
//*     interprediction luma filter for horizontal input
//*
//* //par description:
//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff' to
//*    the elements pointed by 'pu1_src' and writes to the location pointed by
//*    'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
//*    assumptions : the function is optimized considering the fact width is
//*    multiple of 4 or 8. and height as multiple of 2.
58 //* 59 //* //param[in] pu1_src 60 //* uword8 pointer to the source 61 //* 62 //* //param[out] pu1_dst 63 //* uword8 pointer to the destination 64 //* 65 //* //param[in] src_strd 66 //* integer source stride 67 //* 68 //* //param[in] dst_strd 69 //* integer destination stride 70 //* 71 //* //param[in] pi1_coeff 72 //* word8 pointer to the filter coefficients 73 //* 74 //* //param[in] ht 75 //* integer height of the array 76 //* 77 //* //param[in] wd 78 //* integer width of the array 79 //* 80 //* //returns 81 //* 82 //* //remarks 83 //* none 84 //* 85 //******************************************************************************* 86 //*/ 87 88 //void ihevc_inter_pred_luma_horz ( 89 // uword8 *pu1_src, 90 // uword8 *pu1_dst, 91 // word32 src_strd, 92 // word32 dst_strd, 93 // word8 *pi1_coeff, 94 // word32 ht, 95 // word32 wd ) 96 97 //**************variables vs registers***************************************** 98 // x0 => *pu1_src 99 // x1 => *pu1_dst 100 // x2 => src_strd 101 // x3 => dst_strd 102 // x4 => *pi1_coeff 103 // x5 => ht 104 // x6 => wd 105 106 .text 107 .align 4 108 109 .include "ihevc_neon_macros.s" 110 111 .globl ihevc_inter_pred_luma_horz_av8 112 113 .type ihevc_inter_pred_luma_horz_av8, %function 114 115 ihevc_inter_pred_luma_horz_av8: 116 117 // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments 118 push_v_regs 119 stp x19, x20,[sp,#-16]! 
120 //str x1,[sp,#-4] 121 // mov x7,#8192 122 123 mov x15,x4 // pi1_coeff 124 mov x16,x5 // ht 125 mov x17,x6 // wd 126 127 start_loop_count: 128 // ldr x1,[sp,#-4] 129 130 131 mov x4,x15 //loads pi1_coeff 132 mov x8,x16 //loads ht 133 mov x10,x17 //loads wd 134 135 ld1 {v0.8b},[x4] //coeff = vld1_s8(pi1_coeff) 136 mov x11,#1 137 subs x14,x8,#0 //checks for ht == 0 138 139 abs v2.8b, v0.8b //vabs_s8(coeff) 140 141 //ble end_loops 142 143 144 dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0) 145 sub x12,x0,#3 //pu1_src - 3 146 dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1) 147 add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd 148 dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2) 149 sub x20,x10,x2,lsl #1 //2*src_strd - wd 150 neg x9, x20 151 dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3) 152 sub x20,x10,x3,lsl #1 //2*dst_strd - wd 153 neg x8, x20 154 dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4) 155 156 dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5) 157 // tst x10,#7 //checks wd for multiples 158 dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6) 159 dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7) 160 161 mov x7,x1 162 163 cmp x10,#4 164 ble outer_loop_4 165 166 cmp x10,#24 167 mov x20,#16 168 csel x10, x20, x10,eq 169 add x20, x8,#8 170 csel x8, x20, x8,eq 171 add x20, x9,#8 172 csel x9, x20, x9,eq 173 174 cmp x10,#16 175 bge outer_loop_16 176 177 cmp x10,#12 178 add x20, x8,#4 179 csel x8, x20, x8,eq 180 add x20, x9,#4 181 csel x9, x20, x9,eq 182 b outer_loop_8 183 184 185 outer_loop8_residual: 186 sub x12,x0,#3 //pu1_src - 3 187 mov x1,x7 188 mov x14,#32 189 add x1, x1,#16 190 add x12, x12,#16 191 mov x10,#8 192 add x8, x8,#8 193 add x9, x9,#8 194 195 outer_loop_8: 196 197 add x6,x1,x3 //pu1_dst + dst_strd 198 add x4,x12,x2 //pu1_src + src_strd 199 subs x5,x10,#0 //checks wd 200 201 ble end_inner_loop_8 202 203 inner_loop_8: 204 ld1 {v0.2s},[x12],x11 
//vector load pu1_src 205 ld1 {v1.2s},[x12],x11 206 ld1 {v2.2s},[x12],x11 207 ld1 {v3.2s},[x12],x11 208 209 210 211 212 213 // vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] 214 // vext.u8 d3,d0,d1,#3 //vector extract of src[0_3] 215 // vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] 216 // vext.u8 d5,d0,d1,#5 //vector extract of src[0_5] 217 // vext.u8 d6,d0,d1,#6 //vector extract of src [0_6] 218 // vext.u8 d7,d0,d1,#7 //vector extract of src[0_7] 219 // vext.u8 d1,d0,d1,#1 //vector extract of src[0_1] 220 // vext.u8 d14,d12,d13,#2 221 222 //vext.u8 d15,d12,d13,#3 //vector extract of src[0_3] 223 // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4] 224 // vext.u8 d17,d12,d13,#5 //vector extract of src[0_5] 225 //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6] 226 //vext.u8 d19,d12,d13,#7 //vector extract of src[0_7] 227 //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1] 228 ld1 {v4.2s},[x12],x11 229 umull v8.8h, v1.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// 230 ld1 {v5.2s},[x12],x11 231 umlal v8.8h, v3.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// 232 ld1 {v6.2s},[x12],x11 233 umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// 234 ld1 {v7.2s},[x12],x11 235 umlsl v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// 236 ld1 {v12.2s},[x4],x11 //vector load pu1_src + src_strd 237 umlal v8.8h, v4.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)// 238 ld1 {v13.2s},[x4],x11 239 umlsl v8.8h, v5.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)// 240 ld1 {v14.2s},[x4],x11 241 umlal v8.8h, v6.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)// 242 ld1 {v15.2s},[x4],x11 243 umlsl v8.8h, v7.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)// 244 ld1 {v16.2s},[x4],x11 //vector load pu1_src + src_strd 245 246 umull v10.8h, v15.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// 247 ld1 {v17.2s},[x4],x11 248 umlsl v10.8h, v14.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// 249 ld1 
{v18.2s},[x4],x11 250 umlal v10.8h, v16.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)// 251 ld1 {v19.2s},[x4],x11 //vector load pu1_src + src_strd 252 umlsl v10.8h, v17.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)// 253 sqrshrun v20.8b, v8.8h,#6 //right shift and saturating narrow result 1 254 umlal v10.8h, v18.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)// 255 umlsl v10.8h, v19.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)// 256 st1 {v20.8b},[x1],#8 //store the result pu1_dst 257 umlsl v10.8h, v12.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// 258 umlal v10.8h, v13.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// 259 260 261 262 sqrshrun v8.8b, v10.8h,#6 //right shift and saturating narrow result 2 263 subs x5,x5,#8 //decrement the wd loop 264 st1 {v8.8b},[x6],#8 //store the result pu1_dst 265 cmp x5,#4 266 bgt inner_loop_8 267 268 end_inner_loop_8: 269 subs x14,x14,#2 //decrement the ht loop 270 add x12,x12,x9 //increment the src pointer by 2*src_strd-wd 271 add x1,x1,x8 //increment the dst pointer by 2*dst_strd-wd 272 bgt outer_loop_8 273 274 275 276 277 278 mov x10,x17 //loads wd 279 cmp x10,#12 280 281 beq outer_loop4_residual 282 283 284 end_loops: 285 286 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp 287 ldp x19, x20,[sp], #16 288 pop_v_regs 289 ret 290 291 292 293 294 295 296 outer_loop_16: 297 mov x15, #-7 298 stp x0,x7, [sp, #-16]! 
299 300 add x6,x1,x3 //pu1_dst + dst_strd 301 add x4,x12,x2 //pu1_src + src_strd 302 and x0, x12, #31 303 sub x5,x10,#0 //checks wd 304 //ble end_loops1 305 add x20,x12, x2, lsl #1 306 prfm PLDL1KEEP,[x20] 307 ld1 { v0.2s},[x12],#8 //vector load pu1_src 308 ld1 { v1.2s},[x12],x15 //vector load pu1_src 309 add x20,x4, x2, lsl #1 310 prfm PLDL1KEEP,[x20] 311 ld1 { v2.2s},[x12],#8 312 ld1 { v3.2s},[x12],x15 313 ld1 { v4.2s},[x12],#8 314 ld1 { v5.2s},[x12],x15 315 ld1 { v6.2s},[x12],#8 316 ld1 { v7.2s},[x12],x15 317 ld1 { v12.2s},[x12],#8 318 ld1 { v13.2s},[x12],x15 319 umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// 320 ld1 { v14.2s},[x12],#8 321 ld1 { v15.2s},[x12],x15 322 umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// 323 ld1 { v16.2s},[x12],#8 324 ld1 { v17.2s},[x12],x15 325 umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// 326 ld1 { v18.2s},[x12],#8 327 ld1 { v19.2s},[x12],x15 328 umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// 329 umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)// 330 umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)// 331 umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)// 332 umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)// 333 334 335 inner_loop_16: 336 337 338 subs x5,x5,#16 339 umull v20.8h, v3.8b, v25.8b 340 341 add x12, x12,#8 342 umlsl v20.8h, v1.8b, v24.8b 343 344 sub x20,x14,#2 345 csel x14, x20, x14,eq 346 umlal v20.8h, v7.8b, v27.8b 347 348 ld1 { v0.2s},[x4],#8 //vector load pu1_src 349 ld1 { v1.2s},[x4],x15 //vector load pu1_src 350 351 umlsl v20.8h, v5.8b, v26.8b 352 353 ld1 { v2.2s},[x4],#8 354 ld1 { v3.2s},[x4],x15 355 356 umlal v20.8h, v13.8b, v28.8b 357 358 ld1 { v4.2s},[x4],#8 359 ld1 { v5.2s},[x4],x15 360 umlal v20.8h, v17.8b, v30.8b 361 362 ld1 { v6.2s},[x4],#8 363 ld1 { v7.2s},[x4],x15 364 umlsl v20.8h, v15.8b, v29.8b 365 366 ld1 { 
v12.2s},[x4],#8 367 ld1 { v13.2s},[x4],x15 368 umlsl v20.8h, v19.8b, v31.8b 369 370 ld1 { v14.2s},[x4],#8 371 ld1 { v15.2s},[x4],x15 372 sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1 373 374 ld1 { v16.2s},[x4],#8 375 ld1 { v17.2s},[x4],x15 376 umull v10.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// 377 378 ld1 { v18.2s},[x4],#8 379 ld1 { v19.2s},[x4],x15 380 umlal v10.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// 381 382 add x4, x4,#8 383 umlsl v10.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// 384 385 add x20,x12,x9 //increment the src pointer by 2*src_strd-wd 386 csel x12, x20, x12,eq 387 umlsl v10.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// 388 389 add x20,x12,x2 //pu1_src + src_strd 390 csel x4, x20, x4,eq 391 sqrshrun v9.8b, v20.8h,#6 392 393 umlal v10.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)// 394 395 // and x7, x12, #31 396 umlsl v10.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)// 397 398 umlal v10.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)// 399 400 umlsl v10.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)// 401 402 umull v22.8h, v3.8b, v25.8b 403 404 umlsl v22.8h, v1.8b, v24.8b 405 406 st1 { v8.8b},[x1],#8 //store the result pu1_dst 407 st1 { v9.8b},[x1],#8 //store the result pu1_dst 408 umlal v22.8h, v7.8b, v27.8b 409 410 add x20,x1,x8 411 csel x1, x20, x1,eq 412 sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2 413 414 // cmp x7, x0 415 umlsl v22.8h, v5.8b, v26.8b 416 417 add x20,x12, x2, lsl #2 418 prfm PLDL1KEEP,[x20] 419 umlal v22.8h, v13.8b, v28.8b 420 421 add x20,x4, x2, lsl #2 422 prfm PLDL1KEEP,[x20] 423 umlal v22.8h, v17.8b, v30.8b 424 425 // mov x0, x7 426 umlsl v22.8h, v15.8b, v29.8b 427 428 cmp x14,#0 429 umlsl v22.8h, v19.8b, v31.8b 430 431 beq epilog_16 432 ld1 { v0.2s},[x12],#8 //vector load pu1_src 433 ld1 { v1.2s},[x12],x15 //vector load pu1_src 434 ld1 { 
v2.2s},[x12],#8 435 ld1 { v3.2s},[x12],x15 436 ld1 { v4.2s},[x12],#8 437 ld1 { v5.2s},[x12],x15 438 ld1 { v6.2s},[x12],#8 439 ld1 { v7.2s},[x12],x15 440 ld1 { v12.2s},[x12],#8 441 ld1 { v13.2s},[x12],x15 442 sqrshrun v11.8b, v22.8h,#6 443 umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)// 444 ld1 { v14.2s},[x12],#8 445 ld1 { v15.2s},[x12],x15 446 umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)// 447 ld1 { v16.2s},[x12],#8 448 ld1 { v17.2s},[x12],x15 449 umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)// 450 ld1 { v18.2s},[x12],#8 451 ld1 { v19.2s},[x12],x15 452 umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)// 453 umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)// 454 cmp x5,#0 455 umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)// 456 csel x5, x10, x5,eq 457 umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)// 458 st1 { v10.8b},[x6],#8 //store the result pu1_dst 459 st1 { v11.8b},[x6],#8 //store the result pu1_dst 460 umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)// 461 add x20,x1,x3 //pu1_dst + dst_strd 462 csel x6, x20, x6,eq 463 b inner_loop_16 464 465 466 epilog_16: 467 sqrshrun v11.8b, v22.8h,#6 468 st1 { v10.8b},[x6],#8 //store the result pu1_dst 469 st1 { v11.8b},[x6],#8 //store the result pu1_dst 470 471 ldp x0,x7, [sp], #16 472 mov x10,x17 473 cmp x10,#24 474 475 beq outer_loop8_residual 476 477 478 479 end_loops1: 480 481 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp 482 ldp x19, x20,[sp], #16 483 pop_v_regs 484 ret 485 486 487 488 489 490 491 492 493 outer_loop4_residual: 494 sub x12,x0,#3 //pu1_src - 3 495 mov x1,x7 496 add x1, x1,#8 497 mov x10,#4 498 add x12, x12,#8 499 mov x14,#16 500 add x8, x8,#4 501 add x9, x9,#4 502 503 outer_loop_4: 504 add x6,x1,x3 //pu1_dst + dst_strd 505 add x4,x12,x2 //pu1_src + src_strd 506 507 subs x5,x10,#0 //checks wd 508 ble 
end_inner_loop_4 509 510 inner_loop_4: 511 ld1 {v20.2s},[x12],x11 //vector load pu1_src 512 ld1 {v21.2s},[x12],x11 513 ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd 514 ld1 {v23.2s},[x4],x11 515 516 zip1 v0.2s, v20.2s, v22.2s 517 zip2 v12.2s, v20.2s, v22.2s //vector zip the i iteration and ii interation in single register 518 zip1 v1.2s, v21.2s, v23.2s 519 zip2 v13.2s, v21.2s, v23.2s 520 521 ld1 {v20.2s},[x12],x11 //vector load pu1_src 522 ld1 {v21.2s},[x12],x11 523 ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd 524 ld1 {v23.2s},[x4],x11 525 526 zip1 v2.2s, v20.2s, v22.2s 527 zip2 v14.2s, v20.2s, v22.2s 528 zip1 v3.2s, v21.2s, v23.2s 529 zip2 v15.2s, v21.2s, v23.2s 530 531 ld1 {v20.2s},[x12],x11 //vector load pu1_src 532 ld1 {v21.2s},[x12],x11 533 ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd 534 ld1 {v23.2s},[x4],x11 535 536 zip1 v4.2s, v20.2s, v22.2s 537 zip2 v16.2s, v20.2s, v22.2s 538 zip1 v5.2s, v21.2s, v23.2s 539 zip2 v17.2s, v21.2s, v23.2s 540 541 ld1 {v20.2s},[x12],x11 //vector load pu1_src 542 ld1 {v21.2s},[x12],x11 543 ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd 544 ld1 {v23.2s},[x4],x11 545 546 zip1 v6.2s, v20.2s, v22.2s 547 zip2 v18.2s, v20.2s, v22.2s 548 zip1 v7.2s, v21.2s, v23.2s 549 zip2 v19.2s, v21.2s, v23.2s 550 551 //add x12,x12,#4 //increment the input pointer 552 sub x12,x12,#4 553 //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2] 554 //vext.u8 d3,d0,d1,#3 //vector extract of src[0_3] 555 //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4] 556 557 //vext.u8 d5,d0,d1,#5 //vector extract of src[0_5] 558 //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6] 559 //vext.u8 d7,d0,d1,#7 //vector extract of src[0_7] 560 //vext.u8 d1,d0,d1,#1 //vector extract of src[0_1] 561 562 sub x4,x4,#4 563 // add x4,x4,#4 //increment the input pointer 564 // vext.u8 d14,d12,d13,#2 //vector extract of src[0_2] 565 // vext.u8 d15,d12,d13,#3 //vector extract of src[0_3] 566 // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4] 567 // 
vext.u8 d17,d12,d13,#5 //vector extract of src[0_5] 568 // vext.u8 d18,d12,d13,#6 //vector extract of src[0_6] 569 // vext.u8 d19,d12,d13,#7 //vector extract of src[0_7] 570 //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1] 571 572 umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration in the same time 573 umlsl v8.8h, v0.8b, v24.8b 574 umlsl v8.8h, v2.8b, v26.8b 575 umlal v8.8h, v3.8b, v27.8b 576 umlal v8.8h, v4.8b, v28.8b 577 umlsl v8.8h, v5.8b, v29.8b 578 umlal v8.8h, v6.8b, v30.8b 579 umlsl v8.8h, v7.8b, v31.8b 580 581 sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result 582 st1 {v8.s}[0],[x1],#4 //store the i iteration result which is in upper part of the register 583 st1 {v8.s}[1],[x6],#4 //store the ii iteration result which is in lower part of the register 584 subs x5,x5,#4 //decrement the wd by 4 585 bgt inner_loop_4 586 587 end_inner_loop_4: 588 subs x14,x14,#2 //decrement the ht by 4 589 add x12,x12,x9 //increment the input pointer 2*src_strd-wd 590 add x1,x1,x8 //increment the output pointer 2*dst_strd-wd 591 bgt outer_loop_4 592 //subs x7,x7,#1 593 // bgt start_loop_count 594 595 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp 596 ldp x19, x20,[sp], #16 597 pop_v_regs 598 ret 599 600 601 602 603 604 605 606