//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_qpel_vert_hpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
//*******************************************************************************
//*
//* @brief
//*  This function implements a two stage cascaded six tap filter. It
//*  applies the six tap filter in the vertical direction on the
//*  predictor values, followed by applying the same filter in the
//*  horizontal direction on the output of the first stage. It then averages
//*  the rounded output of the first stage and the final stage to obtain the
//*  quarter pel values. The six tap filtering operation is described in
//*  sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @par Description:
//*  This function is called to obtain pixels lying at the following
//*  locations: (1/4,1/2) or (3/4,1/2). The function interpolates
//*  the predictors first in the vertical direction and then in the
//*  horizontal direction to obtain the (1/2,1/2) sample. It then averages
//*  that (1/2,1/2) sample with the neighbouring vertical half-pel sample
//*  (the rounded first-stage output) to obtain (1/4,1/2) or (3/4,1/2),
//*  depending on the x offset carried in dydx.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @param[in] pu1_tmp: temporary buffer (receives the 16-bit first-stage output)
//*
//* @param[in] dydx: x and y reference offset for qpel calculations
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
//                                               UWORD8 *pu1_dst,
//                                               WORD32 src_strd,
//                                               WORD32 dst_strd,
//                                               WORD32 ht,
//                                               WORD32 wd,
//                                               UWORD8* pu1_tmp,
//                                               UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ht
//    w5 =>  wd
//    x6 => *pu1_tmp
//    w7 =>  dydx

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_horz_qpel_vert_hpel_av8

//------------------------------------------------------------------------------
// ih264_inter_pred_luma_horz_qpel_vert_hpel_av8
//
// Syntax : GNU as, AArch64 (A64 + Advanced SIMD)
//
// In:
//   x0 = pu1_src      x1 = pu1_dst
//   w2 = src_strd     w3 = dst_strd   (sign-extended to 64 bit on entry)
//   w4 = ht           w5 = wd         (wd == 4 and wd == 8 have dedicated
//                                      paths; any other value takes the
//                                      16-wide path)
//   x6 = pu1_tmp      w7 = dydx       (only bit 1 is examined)
//
// What the code does, per output row:
//   1. Vertical 6-tap filter on the source bytes starting at
//      pu1_src - 2*src_strd - 2:
//          t = r0 + r5 + 20*(r2 + r3) - 5*(r1 + r4)        (16-bit lanes)
//      The 16-bit results are written to pu1_tmp, 0x30 bytes per row.
//   2. Horizontal 6-tap filter with the same coefficients on those 16-bit
//      values (32-bit accumulation), then a rounding, saturating >> 10 and
//      narrowing to unsigned bytes.
//   3. The 16-bit first-stage value at column offset 2 (or 3 when
//      dydx & 2 is set) is read back from pu1_tmp, rounded with a
//      saturating >> 5 and narrowed to bytes.
//   4. The results of step 2 and step 3 are combined with a rounding
//      average (urhadd) and stored to pu1_dst.
//
// Every loop iteration produces four output rows and subtracts 4 from ht.
//
// Register roles after the prologue:
//   x0  source read pointer           x1  destination write pointer
//   x2  source stride                 x3  destination stride
//   x4  rows remaining                x9  pu1_tmp write pointer
//   x6  pu1_tmp read pointer          x7  0x20      x8  0x30
//   x12 scratch for the width compares
//   wd 4/8 paths : v26 = 20, v24 = 5 (v28/v30 are scratch)
//   wd 16 path   : v28 = 20, v30 = 5 (v24/v26 are scratch)
//
// push_v_regs / pop_v_regs come from ih264_neon_macros.s.
// NOTE(review): presumably they save/restore the callee-saved SIMD registers
// v8-v15 that this routine overwrites -- confirm against the macro file.
// NOTE(review): x19/x20 are saved and restored here but are not referenced
// anywhere in this routine.
//------------------------------------------------------------------------------
ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x2, w2
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5

    sub       x0, x0, x2, lsl #1            // pu1_src - 2*src_strd
    sub       x0, x0, #2                    // ... - 2 columns
    mov       x9, x6                        // x9 = pu1_tmp (write pointer)
    mov       w6, w7                        // w6 = dydx

    and       x6, x6, #2                    // 0 or 2 bytes: selects 16-bit column 2 or 3

    add       x7, x9, #4                    // pu1_tmp + 2 sixteen-bit columns
    add       x6, x7, x6                    // x6 = read pointer into pu1_tmp

    movi      v26.8h, #0x14                 // filter coeff 20 (wd 4/8 paths)
    movi      v24.8h, #0x5                  // filter coeff 5  (wd 4/8 paths)
    movi      v27.8h, #0x14                 // v27/v25 are only used as scratch below
    movi      v25.8h, #0x5
    mov       x7, #0x20
    mov       x8, #0x30                     // bytes per row of pu1_tmp
    subs      x12, x5, #4                   // wd == 4 ?
    beq       loop_4_start

    subs      x12, x5, #8                   // wd == 8 ?
    beq       loop_8_start

    // ---------------------------------------------------------------- wd = 16
    // Each source row is loaded as 16 + 8 bytes; x2 is reduced by 16 so that
    // the second load's post-increment lands on the next row.
    movi      v28.8h, #0x14                 // filter coeff 20
    movi      v30.8h, #0x5                  // filter coeff 5
    sub       x2, x2, #16
    ld1       {v0.2s, v1.2s}, [x0], #16     // src row 0, cols 0-15
    ld1       {v12.2s}, [x0], x2            // src row 0, cols 16-23
    ld1       {v2.2s, v3.2s}, [x0], #16     // src row 1
    ld1       {v13.2s}, [x0], x2
    ld1       {v4.2s, v5.2s}, [x0], #16     // src row 2
    ld1       {v14.2s}, [x0], x2
    ld1       {v6.2s, v7.2s}, [x0], #16     // src row 3
    ld1       {v15.2s}, [x0], x2
    ld1       {v8.2s, v9.2s}, [x0], #16     // src row 4
    ld1       {v16.2s}, [x0], x2

loop_16:

    ld1       {v10.2s, v11.2s}, [x0], #16   // src row 5
    ld1       {v17.2s}, [x0], x2

    // ---- output row 0 of this iteration: vertical filter, 3 x 8 columns
    uaddl     v20.8h, v4.8b, v6.8b
    uaddl     v18.8h, v0.8b, v10.8b
    uaddl     v22.8h, v2.8b, v8.8b
    mla       v18.8h, v20.8h, v28.8h
    uaddl     v24.8h, v5.8b, v7.8b
    uaddl     v20.8h, v1.8b, v11.8b
    uaddl     v26.8h, v3.8b, v9.8b
    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v14.8b, v15.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v12.8b, v17.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v13.8b, v16.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h
    st1       {v18.4s}, [x9], #16           // tmp row: cols 0-7
    st1       {v20.4s}, [x9], #16           // tmp row: cols 8-15
    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6
    st1       {v22.4s}, [x9]                // tmp row: cols 16-23 (reloaded below)
    ext       v22.16b, v18.16b, v20.16b, #10
    add       v0.8h, v24.8h, v26.8h         // a2 + a3
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h        // a1 + a4

    // horizontal filter, output cols 0-7
    saddl     v26.4s, v18.4h, v22.4h        // a0 + a5
    smlal     v26.4s, v0.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v18.8h, v22.8h
    smlal2    v22.4s, v0.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v22.4s, #10
    ld1       {v22.4s}, [x9], #16           // reload tmp cols 16-23; x9 -> next tmp row

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v0.16b, v20.16b, v22.16b, #10
    st1       {v18.2s}, [x1]                // park output cols 0-7 in dst, reloaded below
    add       v18.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    // horizontal filter, output cols 8-15
    saddl     v26.4s, v0.4h, v20.4h
    smlal     v26.4s, v18.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v0.8h, v20.8h
    smlal2    v22.4s, v18.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v18.4h, v22.4s, #10

    uaddl     v24.8h, v7.8b, v9.8b
    ld1       {v20.4s}, [x6], #16           // first-stage values for this row
    ld1       {v22.4s}, [x6], x7            // 16 + 0x20 = one tmp row

    uqxtn     v19.8b, v19.8h
    uqxtn     v18.8b, v18.8h
    mov       v19.s[1], v18.s[0]

    ld1       {v18.2s}, [x1]                // reload parked cols 0-7
    sqrshrun  v20.8b, v20.8h, #5
    sqrshrun  v21.8b, v22.8h, #5
    uaddl     v22.8h, v4.8b, v10.8b
    ld1       {v0.2s, v1.2s}, [x0], #16     // src row 6
    urhadd    v18.16b, v18.16b, v20.16b
    urhadd    v19.16b, v19.16b, v21.16b

    ld1       {v12.2s}, [x0], x2            // src row 6, cols 16-23
    uaddl     v20.8h, v6.8b, v8.8b
    uaddl     v26.8h, v5.8b, v11.8b
    st1       {v18.2s, v19.2s}, [x1], x3    // store output row 0

    // ---- output row 1 of this iteration
    uaddl     v18.8h, v2.8b, v0.8b

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v3.8b, v1.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v15.8b, v16.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v13.8b, v12.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v14.8b, v17.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h
    st1       {v18.4s}, [x9], #16
    st1       {v20.4s}, [x9], #16
    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6
    st1       {v22.4s}, [x9]
    ext       v22.16b, v18.16b, v20.16b, #10
    add       v2.8h, v24.8h, v26.8h
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v22.4h
    smlal     v26.4s, v2.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v18.8h, v22.8h
    smlal2    v22.4s, v2.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v22.4s, #10

    ld1       {v22.4s}, [x9], #16

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v2.16b, v20.16b, v22.16b, #10
    st1       {v18.2s}, [x1]                // park output cols 0-7
    add       v18.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v2.4h, v20.4h
    smlal     v26.4s, v18.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v2.8h, v20.8h
    smlal2    v22.4s, v18.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v18.4h, v22.4s, #10
    uaddl     v24.8h, v9.8b, v11.8b
    ld1       {v20.4s}, [x6], #16
    ld1       {v22.4s}, [x6], x7
    uqxtn     v19.8b, v19.8h
    uqxtn     v18.8b, v18.8h
    mov       v19.s[1], v18.s[0]
    // Only the 8 bytes parked above are needed; load 8 bytes like the other
    // three rows instead of 16 (the upper 8 were never used by the store).
    ld1       {v18.2s}, [x1]
    sqrshrun  v20.8b, v20.8h, #5
    sqrshrun  v21.8b, v22.8h, #5

    uaddl     v22.8h, v6.8b, v0.8b
    ld1       {v2.2s, v3.2s}, [x0], #16     // src row 7

    urhadd    v18.16b, v18.16b, v20.16b
    urhadd    v19.16b, v19.16b, v21.16b
    ld1       {v13.2s}, [x0], x2            // src row 7, cols 16-23
    uaddl     v20.8h, v8.8b, v10.8b
    uaddl     v26.8h, v7.8b, v1.8b
    st1       {v18.2s, v19.2s}, [x1], x3    // store output row 1

    // ---- output row 2 of this iteration
    uaddl     v18.8h, v4.8b, v2.8b

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v5.8b, v3.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v16.8b, v17.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v14.8b, v13.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v15.8b, v12.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h
    st1       {v18.4s}, [x9], #16
    st1       {v20.4s}, [x9], #16
    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6
    st1       {v22.4s}, [x9]
    ext       v22.16b, v18.16b, v20.16b, #10
    add       v4.8h, v24.8h, v26.8h
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v22.4h
    smlal     v26.4s, v4.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v18.8h, v22.8h
    smlal2    v22.4s, v4.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v22.4s, #10
    ld1       {v22.4s}, [x9], #16

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v4.16b, v20.16b, v22.16b, #10
    st1       {v18.2s}, [x1]                // park output cols 0-7
    add       v18.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v4.4h, v20.4h
    smlal     v26.4s, v18.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v4.8h, v20.8h
    smlal2    v22.4s, v18.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v18.4h, v22.4s, #10

    uaddl     v24.8h, v11.8b, v1.8b
    ld1       {v20.4s}, [x6], #16
    ld1       {v22.4s}, [x6], x7

    uqxtn     v19.8b, v19.8h
    uqxtn     v18.8b, v18.8h
    mov       v19.s[1], v18.s[0]

    ld1       {v18.2s}, [x1]                // reload parked cols 0-7
    sqrshrun  v20.8b, v20.8h, #5
    sqrshrun  v21.8b, v22.8h, #5

    uaddl     v22.8h, v8.8b, v2.8b
    ld1       {v4.2s, v5.2s}, [x0], #16     // src row 8

    urhadd    v18.16b, v18.16b, v20.16b
    urhadd    v19.16b, v19.16b, v21.16b
    ld1       {v14.2s}, [x0], x2            // src row 8, cols 16-23
    uaddl     v20.8h, v10.8b, v0.8b
    uaddl     v26.8h, v9.8b, v3.8b
    st1       {v18.2s, v19.2s}, [x1], x3    // store output row 2

    // ---- output row 3 of this iteration
    uaddl     v18.8h, v6.8b, v4.8b

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v7.8b, v5.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v17.8b, v12.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v15.8b, v14.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v16.8b, v13.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h
    st1       {v18.4s}, [x9], #16
    st1       {v20.4s}, [x9], #16
    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6
    st1       {v22.4s}, [x9]
    ext       v22.16b, v18.16b, v20.16b, #10
    add       v6.8h, v24.8h, v26.8h
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v22.4h
    smlal     v26.4s, v6.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v18.8h, v22.8h
    smlal2    v22.4s, v6.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v22.4s, #10
    ld1       {v22.4s}, [x9], #16
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v6.16b, v20.16b, v22.16b, #10
    st1       {v18.2s}, [x1]                // park output cols 0-7
    add       v18.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v6.4h, v20.4h
    smlal     v26.4s, v18.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v6.8h, v20.8h
    smlal2    v22.4s, v18.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    // Slide the source-row window down by four rows for the next iteration
    // (interleaved with the tail of row 3).
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    subs      x4, x4, #4                    // flags consumed by bgt below
    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v18.4h, v22.4s, #10
    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    mov       v24.8b, v14.8b

    mov       v14.16b, v12.16b
    mov       v15.16b, v13.16b

    uqxtn     v19.8b, v19.8h
    uqxtn     v18.8b, v18.8h
    mov       v19.s[1], v18.s[0]

    ld1       {v20.4s}, [x6], #16
    ld1       {v22.4s}, [x6], x7
    ld1       {v18.2s}, [x1]                // reload parked cols 0-7
    sqrshrun  v20.8b, v20.8h, #5
    sqrshrun  v21.8b, v22.8h, #5

    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b

    mov       v12.16b, v16.16b
    mov       v13.16b, v17.16b
    urhadd    v18.16b, v18.16b, v20.16b
    urhadd    v19.16b, v19.16b, v21.16b

    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b

    mov       v16.8b, v24.8b
    st1       {v18.2s, v19.2s}, [x1], x3    // store output row 3

    bgt       loop_16                       // more rows remaining
    b         end_func

    // ----------------------------------------------------------------- wd = 8
loop_8_start:
    ld1       {v0.2s, v1.2s}, [x0], x2      // src row 0
    ld1       {v2.2s, v3.2s}, [x0], x2      // src row 1
    ld1       {v4.2s, v5.2s}, [x0], x2      // src row 2
    ld1       {v6.2s, v7.2s}, [x0], x2      // src row 3
    ld1       {v8.2s, v9.2s}, [x0], x2      // src row 4

loop_8:

    ld1       {v10.2s, v11.2s}, [x0], x2    // src row 5
    uaddl     v14.8h, v4.8b, v6.8b
    uaddl     v12.8h, v0.8b, v10.8b
    uaddl     v16.8h, v2.8b, v8.8b
    mla       v12.8h, v14.8h, v26.8h
    uaddl     v18.8h, v5.8b, v7.8b
    uaddl     v14.8h, v1.8b, v11.8b
    uaddl     v22.8h, v3.8b, v9.8b
    mla       v14.8h, v18.8h, v26.8h
    mls       v12.8h, v16.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [x0], x2      // src row 6
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h, v24.8h
    uaddl     v28.8h, v2.8b, v0.8b
    st1       {v12.4s}, [x9], #16           // tmp row 0: cols 0-7
    ext       v22.16b, v12.16b, v14.16b, #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h, v26.8h
    saddl     v30.4s, v12.4h, v22.4h
    st1       {v14.4s}, [x9], x7            // tmp row 0: cols 8-15
    saddl2    v22.4s, v12.8h, v22.8h
    ext       v16.16b, v12.16b, v14.16b, #4
    mls       v28.8h, v18.8h, v24.8h
    ext       v18.16b, v12.16b, v14.16b, #6
    ext       v20.16b, v12.16b, v14.16b, #8
    ext       v14.16b, v12.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v20.8h
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v14.8h, v3.8b, v1.8b
    st1       {v28.4s}, [x9], #16           // tmp row 1: cols 0-7
    mla       v14.8h, v20.8h, v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    sqrshrun  v13.4h, v22.4s, #10
    mls       v14.8h, v16.8h, v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2      // src row 7
    uqxtn     v25.8b, v12.8h                // v25 holds row 0 pixels until the average
    uqxtn     v13.8b, v13.8h
    mov       v25.s[1], v13.s[0]
    uaddl     v16.8h, v8.8b, v10.8b

    ext       v22.16b, v28.16b, v14.16b, #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h, v26.8h
    st1       {v14.4s}, [x9], x7            // tmp row 1: cols 8-15
    saddl2    v22.4s, v28.8h, v22.8h
    ext       v16.16b, v28.16b, v14.16b, #4
    ext       v18.16b, v28.16b, v14.16b, #6
    ext       v12.16b, v28.16b, v14.16b, #8
    ext       v14.16b, v28.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v12.8h, v14.8h
    ld1       {v14.4s, v15.4s}, [x6], x8    // tmp row 0 (only v14 is consumed)
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    sqrshrun  v14.8b, v14.8h, #0x5
    ld1       {v28.4s, v29.4s}, [x6], x8    // tmp row 1 (only v28 is consumed)
    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10
    sqrshrun  v15.8b, v28.8h, #0x5
    sqrshrun  v17.4h, v22.4s, #10

    mov       v12.8b, v25.8b
    mov       v25.8b, v24.8b

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.s[1], v17.s[0]

    urhadd    v12.16b, v12.16b, v14.16b
    urhadd    v13.16b, v13.16b, v15.16b
    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h, v24.8h
    st1       {v12.2s}, [x1], x3            // store output row 0
    mla       v14.8h, v28.8h, v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2      // src row 8
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h, v24.8h
    st1       {v13.2s}, [x1], x3            // store output row 1
    mla       v28.8h, v30.8h, v26.8h
    st1       {v20.4s}, [x9], #16           // tmp row 2: cols 0-7
    ext       v22.16b, v20.16b, v14.16b, #10
    saddl     v30.4s, v20.4h, v22.4h
    // Store all eight 16-bit results of v14. A two-register {v14.2s, v15.2s}
    // store would write the low half of v15 (unrelated data) in place of the
    // upper half of v14.
    st1       {v14.4s}, [x9], x7            // tmp row 2: cols 8-15
    saddl2    v22.4s, v20.8h, v22.8h
    ext       v16.16b, v20.16b, v14.16b, #4
    ext       v18.16b, v20.16b, v14.16b, #6
    ext       v12.16b, v20.16b, v14.16b, #8
    ext       v14.16b, v20.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h, v26.8h
    sqrshrun  v13.4h, v22.4s, #10
    mls       v28.8h, v20.8h, v24.8h
    ld1       {v14.4s, v15.4s}, [x6], x8    // tmp row 2 (only v14 is consumed)
    mls       v16.8h, v30.8h, v24.8h
    uqxtn     v27.8b, v12.8h                // v27 holds row 2 pixels until the average
    uqxtn     v13.8b, v13.8h
    mov       v27.s[1], v13.s[0]

    sqrshrun  v14.8b, v14.8h, #5
    ext       v22.16b, v28.16b, v16.16b, #10
    st1       {v28.4s}, [x9], #16           // tmp row 3: cols 0-7
    saddl     v30.4s, v28.4h, v22.4h
    // Same reasoning as for row 2: store the whole of v16.
    st1       {v16.4s}, [x9], x7            // tmp row 3: cols 8-15
    saddl2    v22.4s, v28.8h, v22.8h
    ext       v12.16b, v28.16b, v16.16b, #4
    ext       v18.16b, v28.16b, v16.16b, #6
    ext       v20.16b, v28.16b, v16.16b, #8
    ext       v28.16b, v28.16b, v16.16b, #2
    add       v12.8h, v12.8h, v18.8h
    add       v18.8h, v28.8h, v20.8h
    ld1       {v16.4s, v17.4s}, [x6], x8    // tmp row 3 (only v16 is consumed)
    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v12.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    sqrshrun  v15.8b, v16.8h, #0x5

    mov       v12.8b, v27.8b
    mov       v27.8b, v26.8b

    sqrshrun  v16.4h, v30.4s, #10

    // Slide the source-row window down by four rows for the next iteration.
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    sqrshrun  v17.4h, v22.4s, #10

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                    // flags consumed by bgt below
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.s[1], v17.s[0]
    urhadd    v12.16b, v12.16b, v14.16b
    urhadd    v13.16b, v13.16b, v15.16b

    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b

    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b

    st1       {v12.2s}, [x1], x3            // store output row 2
    st1       {v13.2s}, [x1], x3            // store output row 3

    bgt       loop_8                        // more rows remaining
    b         end_func

    // ----------------------------------------------------------------- wd = 4
    // NOTE(review): the instructions in this path that accumulate into v22
    // with saddl/smlal/smlsl read v19, v23 and v29, which are never written
    // on this path, and their results (via sqrshrun into v13/v17) are
    // overwritten before any store to pu1_dst. They look like leftovers from
    // the 32-bit version of this routine -- confirm before removing them.
loop_4_start:
    ld1       {v0.2s, v1.2s}, [x0], x2      // src row 0
    ld1       {v2.2s, v3.2s}, [x0], x2      // src row 1
    ld1       {v4.2s, v5.2s}, [x0], x2      // src row 2
    ld1       {v6.2s, v7.2s}, [x0], x2      // src row 3
    ld1       {v8.2s, v9.2s}, [x0], x2      // src row 4

loop_4:
    ld1       {v10.2s, v11.2s}, [x0], x2    // src row 5
    uaddl     v14.8h, v4.8b, v6.8b          // temp1 = row 2 + row 3
    uaddl     v12.8h, v0.8b, v10.8b         // temp  = row 0 + row 5
    uaddl     v16.8h, v2.8b, v8.8b          // temp2 = row 1 + row 4
    mla       v12.8h, v14.8h, v26.8h        // temp += temp1 * 20
    uaddl     v18.8h, v5.8b, v7.8b          // same again for cols 8-15
    uaddl     v14.8h, v1.8b, v11.8b
    uaddl     v22.8h, v3.8b, v9.8b
    mla       v14.8h, v18.8h, v26.8h        // temp += temp1 * 20
    mls       v12.8h, v16.8h, v24.8h        // temp -= temp2 * 5
    ld1       {v0.2s, v1.2s}, [x0], x2      // src row 6
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h, v24.8h        // temp -= temp2 * 5
    // v12 and v14 now hold the vertically filtered values of row 0
    uaddl     v28.8h, v2.8b, v0.8b
    st1       {v12.4s}, [x9], #16           // tmp row 0: cols 0-7
    ext       v22.16b, v12.16b, v14.16b, #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h, v26.8h
    saddl     v30.4s, v12.4h, v22.4h
    st1       {v14.4s}, [x9], x7            // tmp row 0: cols 8-15
    saddl     v22.4s, v13.4h, v23.4h
    ext       v16.16b, v12.16b, v14.16b, #4
    mls       v28.8h, v18.8h, v24.8h
    ext       v18.16b, v12.16b, v14.16b, #6
    ext       v20.16b, v12.16b, v14.16b, #8
    ext       v14.16b, v12.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v20.8h
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    uaddl     v14.8h, v3.8b, v1.8b
    st1       {v28.4s}, [x9], #16           // tmp row 1: cols 0-7
    mla       v14.8h, v20.8h, v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    sqrshrun  v13.4h, v22.4s, #10
    mls       v14.8h, v16.8h, v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2      // src row 7
    uqxtn     v25.8b, v12.8h                // v25 holds row 0 pixels until the average
    uaddl     v16.8h, v8.8b, v10.8b

    ext       v22.16b, v28.16b, v14.16b, #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h, v26.8h
    st1       {v14.4s}, [x9], x7            // tmp row 1: cols 8-15
    saddl     v22.4s, v29.4h, v23.4h
    ext       v16.16b, v28.16b, v14.16b, #4
    ext       v18.16b, v28.16b, v14.16b, #6
    ext       v12.16b, v28.16b, v14.16b, #8
    ext       v14.16b, v28.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v12.8h, v14.8h
    ld1       {v14.2s}, [x6], x8            // tmp row 0
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    sqrshrun  v14.8b, v14.8h, #0x5
    ld1       {v28.2s}, [x6], x8            // tmp row 1
    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10
    sqrshrun  v15.8b, v28.8h, #0x5
    sqrshrun  v17.4h, v22.4s, #10

    mov       v12.8b, v25.8b
    mov       v25.8b, v24.8b

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h

    urhadd    v12.16b, v12.16b, v14.16b
    urhadd    v13.16b, v13.16b, v15.16b

    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h, v24.8h
    st1       {v12.s}[0], [x1], x3          // store output row 0
    mla       v14.8h, v28.8h, v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2      // src row 8
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h, v24.8h
    st1       {v13.s}[0], [x1], x3          // store output row 1
    mla       v28.8h, v30.8h, v26.8h
    st1       {v20.4s}, [x9], #16           // tmp row 2: cols 0-7
    ext       v22.16b, v20.16b, v14.16b, #10
    saddl     v30.4s, v20.4h, v22.4h
    st1       {v14.4s}, [x9], x7            // tmp row 2: cols 8-15
    saddl     v22.4s, v21.4h, v23.4h
    ext       v16.16b, v20.16b, v14.16b, #4
    ext       v18.16b, v20.16b, v14.16b, #6
    ext       v12.16b, v20.16b, v14.16b, #8
    ext       v14.16b, v20.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v17.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h, v26.8h
    sqrshrun  v13.4h, v22.4s, #10
    mls       v28.8h, v20.8h, v24.8h
    ld1       {v14.2s}, [x6], x8            // tmp row 2
    mls       v16.8h, v30.8h, v24.8h
    uqxtn     v27.8b, v12.8h                // v27 holds row 2 pixels until the average
    sqrshrun  v14.8b, v14.8h, #5
    ext       v22.16b, v28.16b, v16.16b, #10
    st1       {v28.4s}, [x9], #16           // tmp row 3: cols 0-7
    saddl     v30.4s, v28.4h, v22.4h
    st1       {v16.4s}, [x9], x7            // tmp row 3: cols 8-15
    saddl     v22.4s, v29.4h, v23.4h
    ext       v12.16b, v28.16b, v16.16b, #4
    ext       v18.16b, v28.16b, v16.16b, #6
    ext       v20.16b, v28.16b, v16.16b, #8
    ext       v28.16b, v28.16b, v16.16b, #2
    add       v12.8h, v12.8h, v18.8h
    add       v18.8h, v28.8h, v20.8h
    ld1       {v16.2s}, [x6], x8            // tmp row 3
    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal     v22.4s, v13.4h, v26.4h
    smlsl     v22.4s, v19.4h, v24.4h
    sqrshrun  v15.8b, v16.8h, #0x5

    mov       v12.8b, v27.8b
    mov       v27.8b, v26.8b

    sqrshrun  v16.4h, v30.4s, #10

    // Slide the source-row window down by four rows for the next iteration.
    mov       v6.16b, v2.16b
    mov       v7.16b, v3.16b

    sqrshrun  v17.4h, v22.4s, #10

    mov       v2.16b, v10.16b
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                    // flags consumed by bgt below
    uqxtn     v13.8b, v16.8h
    urhadd    v12.16b, v12.16b, v14.16b
    urhadd    v13.16b, v13.16b, v15.16b

    mov       v0.16b, v8.16b
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b
    mov       v9.16b, v5.16b


    mov       v4.16b, v10.16b
    mov       v5.16b, v11.16b


    st1       {v12.s}[0], [x1], x3          // store output row 2
    st1       {v13.s}[0], [x1], x3          // store output row 3

    bgt       loop_4                        // more rows remaining

end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


