//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
///**
///**
//*******************************************************************************
//*
//* @brief
//*   This function implements a two stage cascaded six tap filter. It
//*    applies the six tap filter in the horizontal direction on the
//*    predictor values, followed by applying the same filter in the
//*    vertical direction on the output of the first stage. It then averages
//*    the output of the 1st stage and the output of the 2nd stage to obtain
//*    the quarter pel values. The six tap filtering operation is described
//*    in sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @par Description:
//*     This function is called to obtain pixels lying at the following
//*    location (1/2,1/4) or (1/2,3/4). The function interpolates
//*    the predictors first in the horizontal direction and then in the
//*    vertical direction to output the (1/2,1/2). It then averages
//*    the output of the 1st stage (the horizontal half pel value) and the
//*    (1/2,1/2) value to obtain (1/2,1/4) or (1/2,3/4) depending on the
//*    offset.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @param[in] pu1_tmp: temporary buffer
//*
//* @param[in] dydx: x and y reference offset for qpel calculations
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
//                                UWORD8 *pu1_dst,
//                                WORD32 src_strd,
//                                WORD32 dst_strd,
//                                WORD32 ht,
//                                WORD32 wd,
//                                UWORD8* pu1_tmp,
//                                UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  ht
//    w5 =>  wd
//    x6 => *pu1_tmp
//    w7 =>  dydx

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8

ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:

    //--------------------------------------------------------------------------
    // Target: AArch64, GNU as syntax.
    // In:  x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
    //      w4 = ht, w5 = wd, x6 = pu1_tmp, w7 = dydx
    //
    // Register roles set up by the code below:
    //   x0  source row pointer (columns 0-7), starts at pu1_src - 2*src_strd - 2
    //   x1  destination row pointer (columns 0-7)
    //   x2  src_strd, x3 dst_strd (sign-extended to 64 bit)
    //   x4  remaining-rows counter (low half of wd 16, wd 8 and wd 4 paths)
    //   x6  48 = byte stride between rows of the temp buffer
    //   x7  read pointer into the temp buffer (horizontally filtered rows)
    //   x9  write pointer into the temp buffer
    //   x8 / x14 / x10 / x11  = x0 / x1 / x9 / x7 plus 8, used for columns 8-15
    //   x12 copy of ht, row counter for columns 8-15
    //   v22 = 20 and v24 = 5 (v23 = 5 in the wd 4 path): filter coefficients
    //
    // Per row, the horizontal pass computes for pixels p0..p5
    //     (p0 + p5) + 20 * (p2 + p3) - 5 * (p1 + p4)
    // as 16-bit values and stores them to the temp buffer. The vertical pass
    // applies the same taps to six such rows in 32 bit, narrows with a
    // rounding shift by 10, and averages (urhadd) with the temp-buffer row
    // selected through x7 after a rounding shift by 5.
    //--------------------------------------------------------------------------

    // store register values to stack
    // (x19/x20 are not referenced by any instruction in this routine; the
    //  save/restore pair is kept unchanged)
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x2, w2
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5

    sub       x0, x0, x2, lsl #1        // pu1_src - 2*src_strd
    sub       x0, x0, #2                // pu1_src - 2

    mov       x9, x6                    // x9 = pu1_tmp (temp buffer write pointer)

    // w7 = dydx >> 3; the 32-bit write also clears the upper half of x7.
    // NOTE(review): presumably bit 3 of dydx separates the 1/4 from the 3/4
    // vertical position - confirm against the caller.
    lsr       w7, w7, #3

    // x7 = pu1_tmp + ((dydx >> 3) + 2) * 48. Temp rows 0 and 1 hold source
    // rows -2 and -1, so the +2 skips to the first row of the block.
    add       x7, x7, #2
    mov       x6, #48
    madd      x7, x7, x6, x9

    subs      x12, x5, #4               // if wd == 4 branch to loop_4_start
    beq       loop_4_start

    subs      x12, x5, #8               // if wd == 8 branch to loop_8_start
    beq       loop_8_start

    // wd == 16: columns 0-7 are processed first, then columns 8-15
    movi      v22.8h, #20               // filter coefficient 20
    movi      v24.8h, #5                // filter coefficient 5
    add       x8, x0, #8                // source pointer for columns 8-15
    add       x14, x1, #8               // destination pointer for columns 8-15
    add       x10, x9, #8               // temp write pointer for columns 8-15
    mov       x12, x4                   // keep ht for the second pass
    add       x11, x7, #8               // temp read pointer for columns 8-15

loop_16_lowhalf_start:
    ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b       // p0 + p5

    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b       // p2 + p3
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.8h, v8.8h, v22.8h      // += 20 * (p2 + p3)
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b       // p1 + p4
    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
    mls       v6.8h, v8.8h, v24.8h      // -= 5 * (p1 + p4) -> v6 = filtered row -2
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b

    st1       {v6.4s}, [x9], x6         // store temp buffer 0

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.8h, v10.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
    mls       v8.8h, v10.8h, v24.8h     // v8 = filtered row -1
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b

    st1       {v8.4s}, [x9], x6         // store temp buffer 1

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.8h, v12.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
    mls       v10.8h, v12.8h, v24.8h    // v10 = filtered row 0
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b

    st1       {v10.4s}, [x9], x6        // store temp buffer 2

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.8h, v14.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
    mls       v12.8h, v14.8h, v24.8h    // v12 = filtered row 1
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b

    st1       {v12.4s}, [x9], x6        // store temp buffer 3

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.8h, v16.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b

    mls       v14.8h, v16.8h, v24.8h    // v14 = filtered row 2

    // Loop invariant on entry: v6, v8, v10, v12, v14 hold the five filtered
    // rows preceding the next source row to be loaded; each pass emits 4 rows.
loop_16_lowhalf:

    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v0.8b, v5.8b

    st1       {v14.4s}, [x9], x6        // store temp buffer 4

    uaddl     v18.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.8h, v18.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v28.8h, v8.8h, v14.8h     // vertical taps weighted by -5
    uaddl     v18.8h, v1.8b, v4.8b
    add       v30.8h, v10.8h, v12.8h    // vertical taps weighted by 20
    mls       v16.8h, v18.8h, v24.8h    // v16 = filtered row 3
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 4 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v20.8h, v0.8b, v5.8b

    st1       {v16.4s}, [x9], x6        // store temp buffer 5

    saddl     v18.4s, v6.4h, v16.4h     // outer vertical taps, low 4 lanes

    ld1       {v26.4s}, [x7], x6        // load from temp buffer 0

    saddl2    v6.4s, v6.8h, v16.8h      // outer vertical taps, high 4 lanes

    sqrshrun  v26.8b, v26.8h, #5        // horizontal half-pel pixels

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v20.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v28.8h, v10.8h, v16.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v12.8h, v14.8h
    mls       v20.8h, v2.8h, v24.8h     // v20 = filtered row 4

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]        // v18 low 8 bytes = two-stage result

    ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter

    urhadd    v26.8b, v18.8b, v26.8b    // average the two stages

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2

    st1       {v20.4s}, [x9], x6        // store temp buffer 6

    saddl     v18.4s, v8.4h, v20.4h

    saddl2    v6.4s, v8.8h, v20.8h

    ld1       {v8.4s}, [x7], x6         // load from temp buffer 1

    st1       {v26.2s}, [x1], x3        // store row 0

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h

    sqrshrun  v28.8b, v8.8h, #5
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v0.8b, v5.8b
    uaddl     v2.8h, v2.8b, v3.8b
    sqrshrun  v18.4h, v18.4s, #10
    ext       v4.8b, v0.8b, v1.8b, #4
    sqrshrun  v19.4h, v6.4s, #10
    mla       v8.8h, v2.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v26.8h, v12.8h, v20.8h
    uaddl     v2.8h, v1.8b, v4.8b
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]
    add       v30.8h, v14.8h, v16.8h
    mls       v8.8h, v2.8h, v24.8h      // v8 = filtered row 5
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter

    urhadd    v28.8b, v28.8b, v18.8b

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3

    st1       {v28.2s}, [x1], x3        // store row 1

    uaddl     v28.8h, v0.8b, v5.8b

    st1       {v8.4s}, [x9], x6         // store temp buffer 7

    saddl     v18.4s, v10.4h, v8.4h
    saddl2    v6.4s, v10.8h, v8.8h

    ld1       {v10.4s}, [x7], x6        // load from temp buffer 2

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v26.4h, v24.4h

    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v26.8h, v24.8h

    sqrshrun  v26.8b, v10.8h, #5

    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v28.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v10.8h, v14.8h, v8.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v16.8h, v20.8h
    mls       v28.8h, v2.8h, v24.8h     // v28 = filtered row 6
    uqxtn     v27.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v27.s[1], v19.s[0]
    saddl     v18.4s, v12.4h, v28.4h
    saddl2    v6.4s, v12.8h, v28.8h

    urhadd    v26.8b, v26.8b, v27.8b

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v10.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v10.8h, v24.8h

    st1       {v26.2s}, [x1], x3        // store row 2

    // Store all 8 halfwords of the look-ahead row. x9 is not advanced: the
    // same slot is written again from v14 at the top of the next iteration.
    st1       {v28.4s}, [x9]

    sqrshrun  v18.4h, v18.4s, #10

    mov       v10.16b, v20.16b          // slide the 5-row window down by 4 rows
    ld1       {v30.4s}, [x7], x6        // load from temp buffer 3

    sqrshrun  v19.4h, v6.4s, #10
    subs      x4, x4, #4                // 4 rows done; flags consumed by bgt below

    sqrshrun  v30.8b, v30.8h, #5

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    mov       v12.16b, v8.16b
    mov       v6.16b, v14.16b

    urhadd    v30.8b, v18.8b, v30.8b

    mov       v8.16b, v16.16b
    mov       v14.16b, v28.16b

    st1       {v30.2s}, [x1], x3        // store row 3

    bgt       loop_16_lowhalf           // loop while rows remain


    // Columns 8-15: same sequence as above using x8 / x10 / x11 / x14 / x12.
loop_16_highhalf_start:
    ld1       {v0.2s, v1.2s}, [x8], x2  // row -2 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.8h, v8.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x8], x2  // row -1 load for horizontal filter
    mls       v6.8h, v8.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b

    st1       {v6.4s}, [x10], x6        // store temp buffer 0

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.8h, v10.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 0 load for horizontal filter
    mls       v8.8h, v10.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b

    st1       {v8.4s}, [x10], x6        // store temp buffer 1

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.8h, v12.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 1 load for horizontal filter
    mls       v10.8h, v12.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b

    st1       {v10.4s}, [x10], x6       // store temp buffer 2

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.8h, v14.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 2 load for horizontal filter
    mls       v12.8h, v14.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b

    st1       {v12.4s}, [x10], x6       // store temp buffer 3

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.8h, v16.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b

    mls       v14.8h, v16.8h, v24.8h

loop_16_highhalf:

    ld1       {v0.2s, v1.2s}, [x8], x2  // row 3 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v0.8b, v5.8b

    st1       {v14.4s}, [x10], x6       // store temp buffer 4

    uaddl     v18.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.8h, v18.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v28.8h, v8.8h, v14.8h
    uaddl     v18.8h, v1.8b, v4.8b
    add       v30.8h, v10.8h, v12.8h
    mls       v16.8h, v18.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 4 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v20.8h, v0.8b, v5.8b

    st1       {v16.4s}, [x10], x6       // store temp buffer 5

    saddl     v18.4s, v6.4h, v16.4h

    ld1       {v26.4s}, [x11], x6       // load from temp buffer 0

    saddl2    v6.4s, v6.8h, v16.8h

    sqrshrun  v26.8b, v26.8h, #5

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v20.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v28.8h, v10.8h, v16.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v12.8h, v14.8h
    mls       v20.8h, v2.8h, v24.8h
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 5 load for horizontal filter

    urhadd    v26.8b, v18.8b, v26.8b

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2

    st1       {v20.4s}, [x10], x6       // store temp buffer 6

    saddl     v18.4s, v8.4h, v20.4h
    saddl2    v6.4s, v8.8h, v20.8h

    ld1       {v8.4s}, [x11], x6        // load from temp buffer 1

    st1       {v26.2s}, [x14], x3       // store row 0

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    sqrshrun  v28.8b, v8.8h, #5
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v0.8b, v5.8b
    uaddl     v2.8h, v2.8b, v3.8b
    sqrshrun  v18.4h, v18.4s, #10
    ext       v4.8b, v0.8b, v1.8b, #4
    sqrshrun  v19.4h, v6.4s, #10
    mla       v8.8h, v2.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v26.8h, v12.8h, v20.8h
    uaddl     v2.8h, v1.8b, v4.8b
    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]
    add       v30.8h, v14.8h, v16.8h
    mls       v8.8h, v2.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [x8], x2  // row 6 load for horizontal filter

    urhadd    v28.8b, v28.8b, v18.8b

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3

    st1       {v28.2s}, [x14], x3       // store row 1

    uaddl     v28.8h, v0.8b, v5.8b

    st1       {v8.4s}, [x10], x6        // store temp buffer 7

    saddl     v18.4s, v10.4h, v8.4h
    saddl2    v6.4s, v10.8h, v8.8h

    ld1       {v10.4s}, [x11], x6       // load from temp buffer 2

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v26.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v26.8h, v24.8h

    sqrshrun  v26.8b, v10.8h, #5
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v28.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v10.8h, v14.8h, v8.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v16.8h, v20.8h
    mls       v28.8h, v2.8h, v24.8h
    uqxtn     v27.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v27.s[1], v19.s[0]


    saddl     v18.4s, v12.4h, v28.4h
    saddl2    v6.4s, v12.8h, v28.8h

    urhadd    v26.8b, v26.8b, v27.8b

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v10.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v10.8h, v24.8h

    st1       {v26.2s}, [x14], x3       // store row 2

    // look-ahead row; x10 not advanced, slot is rewritten at the loop top
    st1       {v28.4s}, [x10]

    sqrshrun  v18.4h, v18.4s, #10
    mov       v10.16b, v20.16b          // slide the 5-row window down by 4 rows
    ld1       {v30.4s}, [x11], x6       // load from temp buffer 3

    sqrshrun  v19.4h, v6.4s, #10
    subs      x12, x12, #4              // 4 rows done; flags consumed by bgt below

    sqrshrun  v30.8b, v30.8h, #5

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    mov       v12.16b, v8.16b
    mov       v6.16b, v14.16b
    urhadd    v30.8b, v18.8b, v30.8b

    mov       v8.16b, v16.16b
    mov       v14.16b, v28.16b
    st1       {v30.2s}, [x14], x3       // store row 3

    bgt       loop_16_highhalf          // loop while rows remain
    b         end_func

    // wd == 8: identical to the columns 0-7 pass of the wd == 16 path.
loop_8_start:

    movi      v22.8h, #20               // filter coefficient 20
    movi      v24.8h, #5                // filter coefficient 5
    ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b

    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.8h, v8.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
    mls       v6.8h, v8.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b

    st1       {v6.4s}, [x9], x6         // store temp buffer 0

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.8h, v10.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
    mls       v8.8h, v10.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b

    st1       {v8.4s}, [x9], x6         // store temp buffer 1

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.8h, v12.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
    mls       v10.8h, v12.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b

    st1       {v10.4s}, [x9], x6        // store temp buffer 2

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.8h, v14.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
    mls       v12.8h, v14.8h, v24.8h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b

    st1       {v12.4s}, [x9], x6        // store temp buffer 3

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.8h, v16.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b

    mls       v14.8h, v16.8h, v24.8h
loop_8:

    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v0.8b, v5.8b

    st1       {v14.4s}, [x9], x6        // store temp buffer 4

    uaddl     v18.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.8h, v18.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v28.8h, v8.8h, v14.8h
    uaddl     v18.8h, v1.8b, v4.8b
    add       v30.8h, v10.8h, v12.8h
    mls       v16.8h, v18.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 4 load for horizontal filter
    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v20.8h, v0.8b, v5.8b

    st1       {v16.4s}, [x9], x6        // store temp buffer 5

    saddl     v18.4s, v6.4h, v16.4h

    ld1       {v26.4s}, [x7], x6        // load from temp buffer 0

    saddl2    v6.4s, v6.8h, v16.8h

    sqrshrun  v26.8b, v26.8h, #5

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v20.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v28.8h, v10.8h, v16.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v12.8h, v14.8h
    mls       v20.8h, v2.8h, v24.8h

    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter

    urhadd    v26.8b, v18.8b, v26.8b

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2

    st1       {v20.4s}, [x9], x6        // store temp buffer 6

    saddl     v18.4s, v8.4h, v20.4h

    saddl2    v6.4s, v8.8h, v20.8h

    ld1       {v8.4s}, [x7], x6         // load from temp buffer 1


    st1       {v26.2s}, [x1], x3        // store row 0

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v28.4h, v24.4h



    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v28.8h, v24.8h

    sqrshrun  v28.8b, v8.8h, #5

    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v0.8b, v5.8b
    uaddl     v2.8h, v2.8b, v3.8b
    sqrshrun  v18.4h, v18.4s, #10
    ext       v4.8b, v0.8b, v1.8b, #4
    sqrshrun  v19.4h, v6.4s, #10
    mla       v8.8h, v2.8h, v22.8h
    ext       v1.8b, v0.8b, v1.8b, #1
    add       v26.8h, v12.8h, v20.8h
    uaddl     v2.8h, v1.8b, v4.8b


    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]

    add       v30.8h, v14.8h, v16.8h
    mls       v8.8h, v2.8h, v24.8h
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter

    urhadd    v28.8b, v28.8b, v18.8b

    ext       v5.8b, v0.8b, v1.8b, #5
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3

    st1       {v28.2s}, [x1], x3        // store row 1

    uaddl     v28.8h, v0.8b, v5.8b

    st1       {v8.4s}, [x9], x6         // store temp buffer 7

    saddl     v18.4s, v10.4h, v8.4h
    saddl2    v6.4s, v10.8h, v8.8h

    ld1       {v10.4s}, [x7], x6        // load from temp buffer 2

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v26.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v26.8h, v24.8h

    sqrshrun  v26.8b, v10.8h, #5
    uaddl     v2.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v28.8h, v2.8h, v22.8h
    sqrshrun  v18.4h, v18.4s, #10
    ext       v1.8b, v0.8b, v1.8b, #1
    sqrshrun  v19.4h, v6.4s, #10
    add       v10.8h, v14.8h, v8.8h
    uaddl     v2.8h, v1.8b, v4.8b
    add       v30.8h, v16.8h, v20.8h
    mls       v28.8h, v2.8h, v24.8h

    uqxtn     v27.8b, v18.8h
    uqxtn     v19.8b, v19.8h

    mov       v27.s[1], v19.s[0]

    saddl     v18.4s, v12.4h, v28.4h
    saddl2    v6.4s, v12.8h, v28.8h

    urhadd    v26.8b, v26.8b, v27.8b

    smlal     v18.4s, v30.4h, v22.4h
    smlsl     v18.4s, v10.4h, v24.4h
    smlal2    v6.4s, v30.8h, v22.8h
    smlsl2    v6.4s, v10.8h, v24.8h

    st1       {v26.2s}, [x1], x3        // store row 2

    // Store all 8 halfwords of the look-ahead row. x9 is not advanced: the
    // same slot is written again from v14 at the top of the next iteration.
    st1       {v28.4s}, [x9]


    sqrshrun  v18.4h, v18.4s, #10
    mov       v10.16b, v20.16b          // slide the 5-row window down by 4 rows
    ld1       {v30.4s}, [x7], x6        // load from temp buffer 3

    sqrshrun  v19.4h, v6.4s, #10
    subs      x4, x4, #4                // 4 rows done; flags consumed by bgt below

    sqrshrun  v30.8b, v30.8h, #5


    uqxtn     v18.8b, v18.8h
    uqxtn     v19.8b, v19.8h
    mov       v18.s[1], v19.s[0]


    mov       v12.16b, v8.16b
    mov       v6.16b, v14.16b

    urhadd    v30.8b, v18.8b, v30.8b
    mov       v8.16b, v16.16b
    mov       v14.16b, v28.16b
    st1       {v30.2s}, [x1], x3        // store row 3

    bgt       loop_8                    // loop while rows remain
    b         end_func

    // wd == 4: only the low 4 halfword lanes of each filtered row are used and
    // stored (8 bytes per temp row); two output rows are packed per register.
loop_4_start:
    movi      v22.8h, #20               // filter coefficient 20
    movi      v23.8h, #5                // filter coefficient 5

    ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v6.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v8.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v6.4h, v8.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v8.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load
    mls       v6.4h, v8.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v8.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v10.8h, v2.8b, v3.8b

    st1       {v6.2s}, [x9], x6         // store temp buffer 0

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v8.4h, v10.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v10.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load
    mls       v8.4h, v10.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v10.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v12.8h, v2.8b, v3.8b

    st1       {v8.2s}, [x9], x6         // store temp buffer 1

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v10.4h, v12.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v12.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load
    mls       v10.4h, v12.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v12.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v14.8h, v2.8b, v3.8b

    st1       {v10.2s}, [x9], x6        // store temp buffer 2

    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v12.4h, v14.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v14.8h, v1.8b, v4.8b
    ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load
    mls       v12.4h, v14.4h, v23.4h
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v14.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v16.8h, v2.8b, v3.8b
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v14.4h, v16.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v16.8h, v1.8b, v4.8b

    st1       {v12.2s}, [x9], x6        // store temp buffer 3

    mls       v14.4h, v16.4h, v23.4h

    // Loop invariant on entry: v6, v8, v10, v12, v14 (low 4 lanes) hold the
    // five filtered rows preceding the next source row; 4 rows per pass.
loop_4:

    ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load
    ext       v5.8b, v0.8b, v1.8b, #5
    uaddl     v16.8h, v0.8b, v5.8b
    ext       v2.8b, v0.8b, v1.8b, #2
    ext       v3.8b, v0.8b, v1.8b, #3
    uaddl     v18.8h, v2.8b, v3.8b
    st1       {v14.2s}, [x9], x6        // store temp buffer 4
    ext       v4.8b, v0.8b, v1.8b, #4
    mla       v16.4h, v18.4h, v22.4h
    ext       v1.8b, v0.8b, v1.8b, #1
    uaddl     v18.8h, v1.8b, v4.8b
    add       v2.4h, v10.4h, v12.4h     // vertical taps weighted by 20
    mls       v16.4h, v18.4h, v23.4h
    add       v3.4h, v8.4h, v14.4h      // vertical taps weighted by -5
    ld1       {v18.2s, v19.2s}, [x0], x2 // row 4 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uaddl     v26.8h, v18.8b, v25.8b
    ext       v20.8b, v18.8b, v19.8b, #2

    st1       {v16.2s}, [x9], x6        // store temp buffer 5

    saddl     v0.4s, v6.4h, v16.4h
    smlal     v0.4s, v2.4h, v22.4h
    ext       v21.8b, v18.8b, v19.8b, #3
    uaddl     v28.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    smlsl     v0.4s, v3.4h, v23.4h
    mla       v26.4h, v28.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v28.8h, v19.8b, v24.8b
    add       v2.4h, v12.4h, v14.4h
    mls       v26.4h, v28.4h, v23.4h
    sqrshrun  v0.4h, v0.4s, #10
    add       v3.4h, v10.4h, v16.4h
    ld1       {v18.2s, v19.2s}, [x0], x2 // row 5 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uqxtn     v11.8b, v0.8h             // two-stage result for output row 0
    uaddl     v28.8h, v18.8b, v25.8b

    st1       {v26.2s}, [x9], x6        // store temp buffer 6

    ld1       {v6.2s}, [x7], x6         // load from temp buffer 0
    ld1       {v7.2s}, [x7], x6         // load from temp buffer 1

    sqrshrun  v9.8b, v6.8h, #5
    sqrshrun  v7.8b, v7.8h, #5
    mov       v9.s[1], v7.s[0]          // v9 = half-pel pixels for rows 0 and 1

    ext       v20.8b, v18.8b, v19.8b, #2

    saddl     v0.4s, v8.4h, v26.4h
    smlal     v0.4s, v2.4h, v22.4h
    ext       v21.8b, v18.8b, v19.8b, #3
    uaddl     v6.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    smlsl     v0.4s, v3.4h, v23.4h
    mla       v28.4h, v6.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v6.8h, v19.8b, v24.8b
    add       v2.4h, v14.4h, v16.4h
    mls       v28.4h, v6.4h, v23.4h
    sqrshrun  v0.4h, v0.4s, #10
    add       v3.4h, v12.4h, v26.4h
    ld1       {v18.2s, v19.2s}, [x0], x2 // row 6 load
    ext       v25.8b, v18.8b, v19.8b, #5
    uqxtn     v13.8b, v0.8h             // two-stage result for output row 1

    trn1      v11.2s, v11.2s, v13.2s    // pack rows 0 and 1 into v11
    saddl     v0.4s, v10.4h, v28.4h
    urhadd    v9.8b, v9.8b, v11.8b      // average the two stages

    st1       {v28.2s}, [x9], x6        // store temp buffer 7

    smlal     v0.4s, v2.4h, v22.4h
    uaddl     v30.8h, v18.8b, v25.8b

    st1       {v9.s}[0], [x1], x3       // store row 0

    ext       v20.8b, v18.8b, v19.8b, #2

    st1       {v9.s}[1], [x1], x3       // store row 1

    ext       v21.8b, v18.8b, v19.8b, #3
    smlsl     v0.4s, v3.4h, v23.4h
    uaddl     v8.8h, v20.8b, v21.8b
    ext       v24.8b, v18.8b, v19.8b, #4
    mla       v30.4h, v8.4h, v22.4h
    ext       v19.8b, v18.8b, v19.8b, #1
    uaddl     v8.8h, v19.8b, v24.8b
    sqrshrun  v0.4h, v0.4s, #10
    add       v2.4h, v16.4h, v26.4h
    mls       v30.4h, v8.4h, v23.4h
    uqxtn     v4.8b, v0.8h              // two-stage result for output row 2

    add       v3.4h, v14.4h, v28.4h


    saddl     v0.4s, v12.4h, v30.4h

    // look-ahead row; x9 not advanced, slot is rewritten at the loop top
    st1       {v30.2s}, [x9]

    smlal     v0.4s, v2.4h, v22.4h

    ld1       {v8.2s}, [x7], x6         // load from temp buffer 2
    ld1       {v9.2s}, [x7], x6         // load from temp buffer 3
    smlsl     v0.4s, v3.4h, v23.4h
    subs      x4, x4, #4                // 4 rows done; flags consumed by bgt below

    sqrshrun  v10.8b, v8.8h, #5
    sqrshrun  v9.8b, v9.8h, #5
    mov       v10.s[1], v9.s[0]         // v10 = half-pel pixels for rows 2 and 3

    mov       v12.8b, v28.8b            // slide the 5-row window down by 4 rows

    sqrshrun  v0.4h, v0.4s, #10
    mov       v6.8b, v14.8b
    mov       v8.8b, v16.8b

    uqxtn     v5.8b, v0.8h              // two-stage result for output row 3

    trn1      v4.2s, v4.2s, v5.2s       // pack rows 2 and 3 into v4
    urhadd    v4.8b, v4.8b, v10.8b
    mov       v10.8b, v26.8b
    mov       v14.8b, v30.8b

    st1       {v4.s}[0], [x1], x3       // store row 2
    st1       {v4.s}[1], [x1], x3       // store row 3

    bgt       loop_4                    // loop while rows remain

end_func:
    // Restoring registers from stack
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret