//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//------------------------------------------------------------------------------
// void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
//                                                UWORD8 *pu1_dst,
//                                                WORD32 src_strd,
//                                                WORD32 dst_strd,
//                                                WORD32 ht,
//                                                WORD32 wd,
//                                                UWORD8 *pu1_tmp,
//                                                UWORD32 dydx)
//
// Syntax : GNU as, AArch64 (A64 Advanced SIMD)
//
// Arguments (registers on entry):
//   x0 => pu1_src
//   x1 => pu1_dst
//   w2 => src_strd   (sign-extended to x2 below)
//   w3 => dst_strd   (sign-extended to x3 below)
//   w4 => ht         (sign-extended to x4 below)
//   w5 => wd         (sign-extended to x5 below)
//   pu1_tmp and dydx are never read by this routine.
//
// What the code computes:
//   * The source pointer is moved back by two rows and two bytes, so "row 0"
//     and "column 0" below are relative to pu1_src - 2*src_strd - 2.
//   * Vertical pass, 16-bit lanes, for six consecutive rows r0..r5:
//         t = (r0 + r5) + 20*(r2 + r3) - 5*(r1 + r4)
//   * Horizontal pass, 32-bit lanes, for six consecutive t values t0..t5:
//         s = (t0 + t5) + 20*(t2 + t3) - 5*(t1 + t4)
//   * Each output byte is s, rounding-shifted right by 10 with unsigned
//     saturation (sqrshrun), then saturating-narrowed to 8 bits (uqxtn).
//
// Dispatch and looping:
//   * wd == 4 -> loop_4, wd == 8 -> loop_8, any other value -> 16-wide path.
//   * Every loop iteration stores four destination rows and then subtracts 4
//     from ht; it repeats while the remainder is > 0.
//     NOTE(review): this presumes ht is a positive multiple of 4 - confirm
//     against callers.
//   * Each source row is fetched as 16 bytes (24 bytes on the 16-wide path)
//     whatever wd is.
//     NOTE(review): callers presumably guarantee that much readable data per
//     row - confirm.
//
// Register use:
//   Scalar : x0-x5 only.
//   Vector : v0-v18, v20, v22, v24-v28, v30 on the 4/8-wide paths;
//            v0-v20, v22-v26, v28, v30 on the 16-wide path.
//   push_v_regs / pop_v_regs come from ih264_neon_macros.s; presumably they
//   save and restore the callee-saved SIMD registers (v8-v15) - TODO confirm.
//------------------------------------------------------------------------------

.text
.p2align 2
.include "ih264_neon_macros.s"

    .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8

ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:

    push_v_regs                                 // macro from ih264_neon_macros.s
    sxtw      x2, w2                            // src_strd as 64-bit
    sxtw      x3, w3                            // dst_strd as 64-bit
    sxtw      x4, w4                            // ht as 64-bit
    sxtw      x5, w5                            // wd as 64-bit

    sub       x0, x0, x2, lsl #1                // pu1_src - 2*src_strd
    sub       x0, x0, #2                        // ... and 2 bytes to the left

    // Filter coefficients for the 4- and 8-wide paths.
    movi      v26.8h, #0x14                     // 20 in every 16-bit lane
    movi      v24.8h, #0x5                      // 5 in every 16-bit lane

    cmp       x5, #4
    beq       loop_4_start                      // wd == 4
    cmp       x5, #8
    beq       loop_8_start                      // wd == 8

    //--------------------------------------------------------------------------
    // 16-wide path (taken for any wd other than 4 or 8).
    // v24/v26 are used as scratch here, so the coefficients live in v28/v30.
    // Each source row occupies three 8-byte registers:
    //   bytes 0-7 / 8-15 : v0/v1, v2/v3, v4/v5, v6/v7, v8/v9, v10/v11
    //   bytes 16-23      : v12,   v13,   v14,   v15,   v16,   v17
    //--------------------------------------------------------------------------
    movi      v28.8h, #0x14                     // 20 in every 16-bit lane
    movi      v30.8h, #0x5                      // 5 in every 16-bit lane
    sub       x2, x2, #16                       // stride minus the 16 bytes the first load of a row advances
    ld1       {v0.2s, v1.2s}, [x0], #16         // row 0, bytes 0-15
    ld1       {v12.2s}, [x0], x2                // row 0, bytes 16-23
    ld1       {v2.2s, v3.2s}, [x0], #16         // row 1, bytes 0-15
    ld1       {v13.2s}, [x0], x2                // row 1, bytes 16-23
    ld1       {v4.2s, v5.2s}, [x0], #16         // row 2, bytes 0-15
    ld1       {v14.2s}, [x0], x2                // row 2, bytes 16-23
    ld1       {v6.2s, v7.2s}, [x0], #16         // row 3, bytes 0-15
    ld1       {v15.2s}, [x0], x2                // row 3, bytes 16-23
    ld1       {v8.2s, v9.2s}, [x0], #16         // row 4, bytes 0-15
    ld1       {v16.2s}, [x0], x2                // row 4, bytes 16-23

loop_16:

    ld1       {v10.2s, v11.2s}, [x0], #16       // row 5, bytes 0-15
    ld1       {v17.2s}, [x0], x2                // row 5, bytes 16-23

    //---- output row 0 of this iteration: vertical pass over rows 0..5 ----
    uaddl     v20.8h, v4.8b, v6.8b              // r2 + r3
    uaddl     v18.8h, v0.8b, v10.8b             // r0 + r5
    uaddl     v22.8h, v2.8b, v8.8b              // r1 + r4
    mla       v18.8h, v20.8h, v28.8h            // += 20 * (r2 + r3)
    uaddl     v24.8h, v5.8b, v7.8b
    uaddl     v20.8h, v1.8b, v11.8b
    uaddl     v26.8h, v3.8b, v9.8b
    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v14.8b, v15.8b
    mls       v18.8h, v22.8h, v30.8h            // -= 5 * (r1 + r4) -> t[0..7]
    uaddl     v22.8h, v12.8b, v17.8b
    mls       v20.8h, v26.8h, v30.8h            // t[8..15]
    uaddl     v26.8h, v13.8b, v16.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h            // t[16..23]

    // horizontal pass, output columns 0-7 (window slides over v18:v20)
    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6

    ext       v23.16b, v18.16b, v20.16b, #10
    add       v0.8h, v24.8h, v26.8h             // t2 + t3 (row 0 data in v0 no longer needed)
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h            // t1 + t4

    saddl     v26.4s, v18.4h, v23.4h            // t0 + t5, columns 0-3
    smlal     v26.4s, v0.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h            // t0 + t5, columns 4-7
    smlal2    v23.4s, v0.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]                // v18 = output bytes 0-7

    // horizontal pass, output columns 8-15 (window slides over v20:v22)
    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v0.16b, v20.16b, v22.16b, #10

    add       v25.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v0.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v0.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10

    uaddl     v24.8h, v7.8b, v9.8b              // start of next row: r3 + r4 (bytes 8-15)

    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.s[1], v25.s[0]                // v19 = output bytes 8-15

    uaddl     v22.8h, v4.8b, v10.8b             // next row: r2 + r5
    ld1       {v0.2s, v1.2s}, [x0], #16         // row 6, bytes 0-15
    ld1       {v12.2s}, [x0], x2                // row 6, bytes 16-23
    uaddl     v20.8h, v6.8b, v8.8b              // next row: r3 + r4
    uaddl     v26.8h, v5.8b, v11.8b
    st1       {v18.2s, v19.2s}, [x1], x3        // store output row 0

    //---- output row 1 of this iteration: vertical pass over rows 1..6 ----
    uaddl     v18.8h, v2.8b, v0.8b              // r1 + r6

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v3.8b, v1.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v15.8b, v16.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v13.8b, v12.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v14.8b, v17.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h

    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6

    ext       v23.16b, v18.16b, v20.16b, #10
    add       v2.8h, v24.8h, v26.8h             // row 1 data in v2 no longer needed
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v2.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v2.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v2.16b, v20.16b, v22.16b, #10

    add       v25.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v2.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v2.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10
    uaddl     v24.8h, v9.8b, v11.8b             // start of next row

    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.s[1], v25.s[0]

    uaddl     v22.8h, v6.8b, v0.8b
    ld1       {v2.2s, v3.2s}, [x0], #16         // row 7, bytes 0-15
    ld1       {v13.2s}, [x0], x2                // row 7, bytes 16-23
    uaddl     v20.8h, v8.8b, v10.8b
    uaddl     v26.8h, v7.8b, v1.8b
    st1       {v18.2s, v19.2s}, [x1], x3        // store output row 1

    //---- output row 2 of this iteration: vertical pass over rows 2..7 ----
    uaddl     v18.8h, v4.8b, v2.8b              // r2 + r7

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v5.8b, v3.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v16.8b, v17.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v14.8b, v13.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v15.8b, v12.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h

    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6

    ext       v23.16b, v18.16b, v20.16b, #10
    add       v4.8h, v24.8h, v26.8h             // row 2 data in v4 no longer needed
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v4.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v4.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v4.16b, v20.16b, v22.16b, #10

    add       v25.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v4.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v4.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10

    uaddl     v24.8h, v11.8b, v1.8b             // start of next row

    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.s[1], v25.s[0]

    uaddl     v22.8h, v8.8b, v2.8b
    ld1       {v4.2s, v5.2s}, [x0], #16         // row 8, bytes 0-15
    ld1       {v14.2s}, [x0], x2                // row 8, bytes 16-23
    uaddl     v20.8h, v10.8b, v0.8b
    uaddl     v26.8h, v9.8b, v3.8b
    st1       {v18.2s, v19.2s}, [x1], x3        // store output row 2

    //---- output row 3 of this iteration: vertical pass over rows 3..8 ----
    uaddl     v18.8h, v6.8b, v4.8b              // r3 + r8

    mla       v18.8h, v20.8h, v28.8h

    uaddl     v20.8h, v7.8b, v5.8b

    mla       v20.8h, v24.8h, v28.8h
    uaddl     v24.8h, v17.8b, v12.8b
    mls       v18.8h, v22.8h, v30.8h
    uaddl     v22.8h, v15.8b, v14.8b
    mls       v20.8h, v26.8h, v30.8h
    uaddl     v26.8h, v16.8b, v13.8b
    mla       v22.8h, v24.8h, v28.8h
    mls       v22.8h, v26.8h, v30.8h

    ext       v24.16b, v18.16b, v20.16b, #4
    ext       v26.16b, v18.16b, v20.16b, #6

    ext       v23.16b, v18.16b, v20.16b, #10
    add       v6.8h, v24.8h, v26.8h             // row 3 data in v6 no longer needed
    ext       v24.16b, v18.16b, v20.16b, #2
    ext       v26.16b, v18.16b, v20.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v18.4h, v23.4h
    smlal     v26.4s, v6.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v23.4s, v18.8h, v23.8h
    smlal2    v23.4s, v6.8h, v28.8h
    smlsl2    v23.4s, v24.8h, v30.8h

    sqrshrun  v18.4h, v26.4s, #10
    sqrshrun  v19.4h, v23.4s, #10

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ext       v24.16b, v20.16b, v22.16b, #4
    ext       v26.16b, v20.16b, v22.16b, #6
    ext       v6.16b, v20.16b, v22.16b, #10

    add       v25.8h, v24.8h, v26.8h
    ext       v24.16b, v20.16b, v22.16b, #2
    ext       v26.16b, v20.16b, v22.16b, #8
    add       v24.8h, v24.8h, v26.8h

    saddl     v26.4s, v6.4h, v20.4h
    smlal     v26.4s, v25.4h, v28.4h
    smlsl     v26.4s, v24.4h, v30.4h

    saddl2    v22.4s, v6.8h, v20.8h
    smlal2    v22.4s, v25.8h, v28.8h
    smlsl2    v22.4s, v24.8h, v30.8h

    // Rotate the row window for the next iteration: rows 4..8 become rows
    // 0..4.  v10/v11 and v24 serve as temporaries during the shuffle.
    mov       v6.16b, v2.16b                    // row 7 -> row 3 slot
    mov       v7.16b, v3.16b

    mov       v2.16b, v10.16b                   // row 5 -> row 1 slot
    mov       v3.16b, v11.16b

    subs      x4, x4, #4                        // four rows done; flags consumed by bgt below
    sqrshrun  v19.4h, v26.4s, #10
    sqrshrun  v25.4h, v22.4s, #10
    mov       v10.16b, v0.16b                   // park row 6
    mov       v11.16b, v1.16b

    mov       v24.8b, v14.8b                    // park row 8, bytes 16-23

    mov       v14.16b, v12.16b                  // row 6 -> row 2 slot (bytes 16-23)
    mov       v15.16b, v13.16b                  // row 7 -> row 3 slot (bytes 16-23)

    uqxtn     v19.8b, v19.8h
    uqxtn     v25.8b, v25.8h
    mov       v19.s[1], v25.s[0]

    mov       v0.16b, v8.16b                    // row 4 -> row 0 slot
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b                    // row 8 -> row 4 slot
    mov       v9.16b, v5.16b

    mov       v12.16b, v16.16b                  // row 4 -> row 0 slot (bytes 16-23)
    mov       v13.16b, v17.16b                  // row 5 -> row 1 slot (bytes 16-23)

    mov       v4.16b, v10.16b                   // row 6 -> row 2 slot
    mov       v5.16b, v11.16b

    mov       v16.8b, v24.8b                    // row 8 -> row 4 slot (bytes 16-23)
    st1       {v18.2s, v19.2s}, [x1], x3        // store output row 3

    bgt       loop_16                           // more rows remaining
    b         end_func

    //--------------------------------------------------------------------------
    // 8-wide path.  Coefficients: v26 = 20, v24 = 5.
    // Each source row is 16 bytes in a register pair (bytes 0-7 / 8-15).
    //--------------------------------------------------------------------------
loop_8_start:
    ld1       {v0.2s, v1.2s}, [x0], x2          // row 0
    ld1       {v2.2s, v3.2s}, [x0], x2          // row 1
    ld1       {v4.2s, v5.2s}, [x0], x2          // row 2
    ld1       {v6.2s, v7.2s}, [x0], x2          // row 3
    ld1       {v8.2s, v9.2s}, [x0], x2          // row 4

loop_8:

    ld1       {v10.2s, v11.2s}, [x0], x2        // row 5
    uaddl     v14.8h, v4.8b, v6.8b              // r2 + r3
    uaddl     v12.8h, v0.8b, v10.8b             // r0 + r5
    uaddl     v16.8h, v2.8b, v8.8b              // r1 + r4
    mla       v12.8h, v14.8h, v26.8h            // += 20 * (r2 + r3)
    uaddl     v18.8h, v5.8b, v7.8b
    uaddl     v14.8h, v1.8b, v11.8b
    uaddl     v22.8h, v3.8b, v9.8b
    mla       v14.8h, v18.8h, v26.8h
    mls       v12.8h, v16.8h, v24.8h            // -= 5 * (r1 + r4) -> t[0..7]
    ld1       {v0.2s, v1.2s}, [x0], x2          // row 6
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h, v24.8h            // t[8..15]
    uaddl     v28.8h, v2.8b, v0.8b              // next row: r1 + r6

    ext       v22.16b, v12.16b, v14.16b, #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h, v26.8h
    saddl     v30.4s, v12.4h, v22.4h            // t0 + t5, columns 0-3

    saddl2    v22.4s, v12.8h, v22.8h            // t0 + t5, columns 4-7
    ext       v16.16b, v12.16b, v14.16b, #4
    mls       v28.8h, v18.8h, v24.8h
    ext       v18.16b, v12.16b, v14.16b, #6
    ext       v20.16b, v12.16b, v14.16b, #8
    ext       v14.16b, v12.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h            // t2 + t3
    add       v18.8h, v14.8h, v20.8h            // t1 + t4
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v14.8h, v3.8b, v1.8b

    mla       v14.8h, v20.8h, v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    sqrshrun  v13.4h, v22.4s, #10
    mls       v14.8h, v16.8h, v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2          // row 7
    uqxtn     v25.8b, v12.8h                    // v25 parks output row 0 while v12 is reused
    uqxtn     v13.8b, v13.8h
    mov       v25.s[1], v13.s[0]
    uaddl     v16.8h, v8.8b, v10.8b

    ext       v22.16b, v28.16b, v14.16b, #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h, v26.8h

    saddl2    v22.4s, v28.8h, v22.8h
    ext       v16.16b, v28.16b, v14.16b, #4
    ext       v18.16b, v28.16b, v14.16b, #6
    ext       v12.16b, v28.16b, v14.16b, #8
    ext       v14.16b, v28.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v12.8h, v14.8h

    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h

    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10

    sqrshrun  v17.4h, v22.4s, #10

    mov       v12.8b, v25.8b                    // output row 0 back into v12 for the store

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.s[1], v17.s[0]                // v13 = output row 1

    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h, v24.8h
    st1       {v12.2s}, [x1], x3                // store output row 0
    mla       v14.8h, v28.8h, v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2          // row 8
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h, v24.8h
    st1       {v13.2s}, [x1], x3                // store output row 1
    mla       v28.8h, v30.8h, v26.8h

    ext       v22.16b, v20.16b, v14.16b, #10
    saddl     v30.4s, v20.4h, v22.4h

    saddl2    v22.4s, v20.8h, v22.8h
    ext       v16.16b, v20.16b, v14.16b, #4
    ext       v18.16b, v20.16b, v14.16b, #6
    ext       v12.16b, v20.16b, v14.16b, #8
    ext       v14.16b, v20.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v16.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h, v26.8h
    sqrshrun  v13.4h, v22.4s, #10
    mls       v28.8h, v20.8h, v24.8h

    mls       v16.8h, v30.8h, v24.8h
    uqxtn     v27.8b, v12.8h                    // v27 parks output row 2 while v12 is reused
    uqxtn     v13.8b, v13.8h
    mov       v27.s[1], v13.s[0]

    ext       v22.16b, v28.16b, v16.16b, #10

    saddl     v30.4s, v28.4h, v22.4h

    saddl2    v22.4s, v28.8h, v22.8h
    ext       v12.16b, v28.16b, v16.16b, #4
    ext       v18.16b, v28.16b, v16.16b, #6
    ext       v20.16b, v28.16b, v16.16b, #8
    ext       v28.16b, v28.16b, v16.16b, #2
    add       v12.8h, v12.8h, v18.8h
    add       v18.8h, v28.8h, v20.8h

    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    smlal2    v22.4s, v12.8h, v26.8h
    smlsl2    v22.4s, v18.8h, v24.8h

    mov       v12.8b, v27.8b                    // output row 2 back into v12 for the store

    sqrshrun  v16.4h, v30.4s, #10

    // Rotate the row window: rows 4..8 become rows 0..4 (v10/v11 as temp).
    mov       v6.16b, v2.16b                    // row 7 -> row 3 slot
    mov       v7.16b, v3.16b

    sqrshrun  v17.4h, v22.4s, #10

    mov       v2.16b, v10.16b                   // row 5 -> row 1 slot
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b                   // park row 6
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                        // four rows done; flags consumed by bgt below
    uqxtn     v13.8b, v16.8h
    uqxtn     v17.8b, v17.8h
    mov       v13.s[1], v17.s[0]                // v13 = output row 3

    mov       v0.16b, v8.16b                    // row 4 -> row 0 slot
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b                    // row 8 -> row 4 slot
    mov       v9.16b, v5.16b

    mov       v4.16b, v10.16b                   // row 6 -> row 2 slot
    mov       v5.16b, v11.16b

    st1       {v12.2s}, [x1], x3                // store output row 2
    st1       {v13.2s}, [x1], x3                // store output row 3

    bgt       loop_8                            // more rows remaining
    b         end_func

    //--------------------------------------------------------------------------
    // 4-wide path.  Coefficients: v26 = 20, v24 = 5.
    // Same schedule as loop_8, but only output columns 0-3 are stored, so only
    // the low four 32-bit lanes of the horizontal filter are evaluated.  The
    // vertical pass still covers bytes 8-15 because the window for column 3
    // reaches into t[8].
    //--------------------------------------------------------------------------
loop_4_start:
    ld1       {v0.2s, v1.2s}, [x0], x2          // row 0
    ld1       {v2.2s, v3.2s}, [x0], x2          // row 1
    ld1       {v4.2s, v5.2s}, [x0], x2          // row 2
    ld1       {v6.2s, v7.2s}, [x0], x2          // row 3
    ld1       {v8.2s, v9.2s}, [x0], x2          // row 4

loop_4:
    ld1       {v10.2s, v11.2s}, [x0], x2        // row 5
    uaddl     v14.8h, v4.8b, v6.8b              // r2 + r3
    uaddl     v12.8h, v0.8b, v10.8b             // r0 + r5
    uaddl     v16.8h, v2.8b, v8.8b              // r1 + r4
    mla       v12.8h, v14.8h, v26.8h            // += 20 * (r2 + r3)
    uaddl     v18.8h, v5.8b, v7.8b              // r2 + r3 (bytes 8-15)
    uaddl     v14.8h, v1.8b, v11.8b             // r0 + r5 (bytes 8-15)
    uaddl     v22.8h, v3.8b, v9.8b              // r1 + r4 (bytes 8-15)
    mla       v14.8h, v18.8h, v26.8h            // += 20 * (r2 + r3)
    mls       v12.8h, v16.8h, v24.8h            // -= 5 * (r1 + r4) -> t[0..7]
    ld1       {v0.2s, v1.2s}, [x0], x2          // row 6
    uaddl     v16.8h, v6.8b, v8.8b
    mls       v14.8h, v22.8h, v24.8h            // -= 5 * (r1 + r4) -> t[8..15]
    uaddl     v28.8h, v2.8b, v0.8b              // next row: r1 + r6

    ext       v22.16b, v12.16b, v14.16b, #10
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v28.8h, v16.8h, v26.8h
    saddl     v30.4s, v12.4h, v22.4h            // t0 + t5, columns 0-3

    ext       v16.16b, v12.16b, v14.16b, #4
    mls       v28.8h, v18.8h, v24.8h
    ext       v18.16b, v12.16b, v14.16b, #6
    ext       v20.16b, v12.16b, v14.16b, #8
    ext       v14.16b, v12.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h            // t2 + t3
    add       v18.8h, v14.8h, v20.8h            // t1 + t4
    uaddl     v20.8h, v7.8b, v9.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    uaddl     v14.8h, v3.8b, v1.8b

    mla       v14.8h, v20.8h, v26.8h
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v16.8h, v5.8b, v11.8b
    mls       v14.8h, v16.8h, v24.8h
    ld1       {v2.2s, v3.2s}, [x0], x2          // row 7
    uqxtn     v25.8b, v12.8h                    // v25 parks output row 0 while v12 is reused
    uaddl     v16.8h, v8.8b, v10.8b

    ext       v22.16b, v28.16b, v14.16b, #10
    uaddl     v20.8h, v4.8b, v2.8b
    saddl     v30.4s, v28.4h, v22.4h
    mla       v20.8h, v16.8h, v26.8h

    ext       v16.16b, v28.16b, v14.16b, #4
    ext       v18.16b, v28.16b, v14.16b, #6
    ext       v12.16b, v28.16b, v14.16b, #8
    ext       v14.16b, v28.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v12.8h, v14.8h

    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h

    uaddl     v18.8h, v6.8b, v0.8b
    sqrshrun  v16.4h, v30.4s, #10

    mov       v12.8b, v25.8b                    // output row 0 back into v12 for the store

    uaddl     v28.8h, v9.8b, v11.8b
    uqxtn     v13.8b, v16.8h                    // v13 = output row 1

    uaddl     v14.8h, v5.8b, v3.8b
    uaddl     v22.8h, v7.8b, v1.8b
    mls       v20.8h, v18.8h, v24.8h
    st1       {v12.s}[0], [x1], x3              // store output row 0 (4 bytes)
    mla       v14.8h, v28.8h, v26.8h
    ld1       {v4.2s, v5.2s}, [x0], x2          // row 8
    uaddl     v30.8h, v10.8b, v0.8b
    uaddl     v28.8h, v6.8b, v4.8b
    mls       v14.8h, v22.8h, v24.8h
    st1       {v13.s}[0], [x1], x3              // store output row 1 (4 bytes)
    mla       v28.8h, v30.8h, v26.8h

    ext       v22.16b, v20.16b, v14.16b, #10
    saddl     v30.4s, v20.4h, v22.4h

    ext       v16.16b, v20.16b, v14.16b, #4
    ext       v18.16b, v20.16b, v14.16b, #6
    ext       v12.16b, v20.16b, v14.16b, #8
    ext       v14.16b, v20.16b, v14.16b, #2
    add       v16.8h, v16.8h, v18.8h
    add       v18.8h, v14.8h, v12.8h
    uaddl     v20.8h, v8.8b, v2.8b
    smlal     v30.4s, v16.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h
    uaddl     v18.8h, v11.8b, v1.8b
    uaddl     v16.8h, v7.8b, v5.8b
    sqrshrun  v12.4h, v30.4s, #10
    uaddl     v30.8h, v9.8b, v3.8b
    mla       v16.8h, v18.8h, v26.8h
    mls       v28.8h, v20.8h, v24.8h

    mls       v16.8h, v30.8h, v24.8h
    uqxtn     v27.8b, v12.8h                    // v27 parks output row 2 while v12 is reused

    ext       v22.16b, v28.16b, v16.16b, #10

    saddl     v30.4s, v28.4h, v22.4h

    ext       v12.16b, v28.16b, v16.16b, #4
    ext       v18.16b, v28.16b, v16.16b, #6
    ext       v20.16b, v28.16b, v16.16b, #8
    ext       v28.16b, v28.16b, v16.16b, #2
    add       v12.8h, v12.8h, v18.8h
    add       v18.8h, v28.8h, v20.8h

    smlal     v30.4s, v12.4h, v26.4h
    smlsl     v30.4s, v18.4h, v24.4h

    mov       v12.8b, v27.8b                    // output row 2 back into v12 for the store

    sqrshrun  v16.4h, v30.4s, #10

    // Rotate the row window: rows 4..8 become rows 0..4 (v10/v11 as temp).
    mov       v6.16b, v2.16b                    // row 7 -> row 3 slot
    mov       v7.16b, v3.16b

    mov       v2.16b, v10.16b                   // row 5 -> row 1 slot
    mov       v3.16b, v11.16b

    mov       v10.16b, v0.16b                   // park row 6
    mov       v11.16b, v1.16b

    subs      x4, x4, #4                        // four rows done; flags consumed by bgt below
    uqxtn     v13.8b, v16.8h                    // v13 = output row 3

    mov       v0.16b, v8.16b                    // row 4 -> row 0 slot
    mov       v1.16b, v9.16b

    mov       v8.16b, v4.16b                    // row 8 -> row 4 slot
    mov       v9.16b, v5.16b

    mov       v4.16b, v10.16b                   // row 6 -> row 2 slot
    mov       v5.16b, v11.16b

    st1       {v12.s}[0], [x1], x3              // store output row 2 (4 bytes)
    st1       {v13.s}[0], [x1], x3              // store output row 3 (4 bytes)

    bgt       loop_4                            // more rows remaining

end_func:
    pop_v_regs                                  // macro from ih264_neon_macros.s
    ret