//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264e_half_pel.s
// *
// * @brief
// *
// *
// * @author
// *  Ittiam
// *
// * @par List of Functions:
// *  ih264e_sixtapfilter_horz
// *  ih264e_sixtap_filter_2dvh_vert
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */


.text
.p2align 2
.include "ih264_neon_macros.s"

///*******************************************************************************
//*
//* @brief
//*     Inter prediction luma filter for horizontal input (filter run for
//*     width = 17 and height = 16)
//*
//* @par Description:
//*     Applies a 6-tap horizontal filter. The output is clipped to 8 bits.
//*     See sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
//                              UWORD8 *pu1_dst,
//                              WORD32 src_strd,
//                              WORD32 dst_strd);
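///*******************************************************************************
//* A minimal C reference sketch of what the NEON routine below computes; it is
//* not part of the original source. clip_u8 and the _ref suffix are
//* illustrative, and UWORD8/WORD32 stand for uint8_t/int32_t as elsewhere in
//* the codec. Each output pixel is the H.264 half-pel sum
//* a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 over six horizontal neighbours,
//* rounded as (sum + 16) >> 5 and clipped to [0, 255].
//*
//* static inline UWORD8 clip_u8(WORD32 x)
//* {
//*     return (UWORD8)(x < 0 ? 0 : (x > 255 ? 255 : x));
//* }
//*
//* static void ih264e_sixtapfilter_horz_ref(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//*                                          WORD32 src_strd, WORD32 dst_strd)
//* {
//*     for (WORD32 row = 0; row < 16; row++)
//*     {
//*         for (WORD32 col = 0; col < 17; col++)
//*         {
//*             UWORD8 *p = pu1_src + col - 2;  /* a0 sits two samples left */
//*             WORD32 sum = p[0] - 5 * p[1] + 20 * p[2]
//*                        + 20 * p[3] - 5 * p[4] + p[5];
//*             pu1_dst[col] = clip_u8((sum + 16) >> 5);
//*         }
//*         pu1_src += src_strd;
//*         pu1_dst += dst_strd;
//*     }
//* }
//*******************************************************************************/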
.equ halfpel_width, 17 + 1                    // make it even; two rows are processed at a time


.global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
    // STMFD sp!,{x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    movi      v0.8b, #5
    sub       x0, x0, #2
    sub       x3, x3, #16
    movi      v1.8b, #20
    mov       x14, #16

filter_horz_loop:

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1

    //// Processing row0 and row1

    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v30.8b, v3.8b, v4.8b, #5

    uaddl     v8.8h, v31.8b, v2.8b            //// a0 + a5 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #5
    uaddl     v10.8h, v30.8b, v3.8b           //// a0 + a5 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #5
    uaddl     v12.8h, v29.8b, v4.8b           //// a0 + a5 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b           //// a0 + a5 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #5

    uaddl     v16.8h, v27.8b, v6.8b           //// a0 + a5 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v18.8h, v26.8b, v7.8b           //// a0 + a5 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b            //// a0 + a5 + 20a2 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #2
    umlal     v10.8h, v30.8b, v1.8b           //// a0 + a5 + 20a2 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v12.8h, v29.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v14.8h, v28.8b, v1.8b           //// a0 + a5 + 20a2 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #2

    umlal     v16.8h, v27.8b, v1.8b           //// a0 + a5 + 20a2 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v18.8h, v26.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #3
    umlal     v10.8h, v30.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v12.8h, v29.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #3

    umlal     v16.8h, v27.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v18.8h, v26.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b            //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v12.8h, v29.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #1

    umlsl     v16.8h, v27.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v18.8h, v26.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b            //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v12.8h, v29.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #4

    umlsl     v16.8h, v27.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
    umlsl     v18.8h, v26.8b, v0.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1)
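    //// Narrow each 16-bit sum to 8 bits: sqrshrun #5 adds the rounding
    //// constant 16, shifts right by 5, and saturates the result to [0, 255].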
    sqrshrun  v20.8b, v8.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    sqrshrun  v21.8b, v10.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    sqrshrun  v22.8b, v12.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    sqrshrun  v23.8b, v14.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    sqrshrun  v24.8b, v16.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
    sqrshrun  v25.8b, v18.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1)

    st1       {v20.8b, v21.8b}, [x1], #16     //// Store dest row0
    st1       {v22.h}[0], [x1], x3
    st1       {v23.8b, v24.8b}, [x1], #16     //// Store dest row1
    st1       {v25.h}[0], [x1], x3

    subs      x14, x14, #2                    //// decrement row counter

    bne       filter_horz_loop

    // LDMFD sp!,{pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///**
//*******************************************************************************
//*
//* @brief
//*     This function implements a two-stage cascaded six-tap filter. It
//*     applies the six-tap filter in the vertical direction on the
//*     predictor values, then applies the same filter in the horizontal
//*     direction on the output of the first stage. The six-tap filtering
//*     operation is described in sec 8.4.2.2.1 titled "Luma sample
//*     interpolation process" (filter run for width = 17 and height = 17).
//*
//* @par Description:
//*     The function interpolates the predictors first in the vertical
//*     direction and then in the horizontal direction to output the
//*     (1/2,1/2) samples. The output of the first stage of the filter is
//*     stored in the buffer pointed to by pi16_pred1 (only in C) in 16-bit
//*     precision.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst1
//*  UWORD8 pointer to the destination (vertical filtered output)
//*
//* @param[out] pu1_dst2
//*  UWORD8 pointer to the destination (output after applying the horizontal
//*  filter to the intermediate vertical output)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride of pu1_dst
//*
//* @param[in] pi16_pred1
//*  pointer to the 16-bit intermediate buffer (used only in C)
//*
//* @param[in] pi16_pred1_strd
//*  integer stride of pi16_pred1
//*
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
//                                    UWORD8 *pu1_dst1,
//                                    UWORD8 *pu1_dst2,
//                                    WORD32 src_strd,
//                                    WORD32 dst_strd,
//                                    WORD32 *pi16_pred1, /* pointer to 16-bit intermediate buffer (used only in C) */
//                                    WORD32 pi16_pred1_strd)
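///*******************************************************************************
//* A minimal C reference sketch of the two-stage filter; not part of the
//* original source. The _ref suffix is illustrative, clip_u8 is the helper
//* sketched above ih264e_sixtapfilter_horz_av8, and the use of pi16_pred1 as
//* a scratch row of 17 + 5 entries advanced by pi16_pred1_strd is an
//* assumption. Stage 2 rounds as (sum + 512) >> 10, which the NEON code
//* below splits into shrn #8 followed by sqrshrun #2 (see its comments).
//*
//* static void ih264e_sixtap_filter_2dvh_vert_ref(UWORD8 *pu1_src,
//*         UWORD8 *pu1_dst1, UWORD8 *pu1_dst2, WORD32 src_strd,
//*         WORD32 dst_strd, WORD32 *pi16_pred1, WORD32 pi16_pred1_strd)
//* {
//*     for (WORD32 row = 0; row < 17; row++)
//*     {
//*         WORD32 *mid = pi16_pred1 + 2;
//*         /* Stage 1: vertical six-tap at 16-bit precision; two extra
//*            columns on each side feed stage 2's horizontal taps. */
//*         for (WORD32 col = -2; col < 17 + 3; col++)
//*         {
//*             UWORD8 *p = pu1_src + (row - 2) * src_strd + col;
//*             mid[col] = p[0] - 5 * p[src_strd] + 20 * p[2 * src_strd]
//*                      + 20 * p[3 * src_strd] - 5 * p[4 * src_strd]
//*                      + p[5 * src_strd];
//*         }
//*         for (WORD32 col = 0; col < 17; col++)
//*         {
//*             /* (1,1/2) grid: round the vertical sum */
//*             pu1_dst1[col] = clip_u8((mid[col] + 16) >> 5);
//*             /* (1/2,1/2) grid: horizontal six-tap over the intermediate */
//*             WORD32 h = mid[col - 2] - 5 * mid[col - 1] + 20 * mid[col]
//*                      + 20 * mid[col + 1] - 5 * mid[col + 2] + mid[col + 3];
//*             pu1_dst2[col] = clip_u8((h + 512) >> 10);
//*         }
//*         pi16_pred1 += pi16_pred1_strd;
//*         pu1_dst1 += dst_strd;
//*         pu1_dst2 += dst_strd;
//*     }
//* }
//*******************************************************************************/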
.global ih264e_sixtap_filter_2dvh_vert_av8

ih264e_sixtap_filter_2dvh_vert_av8:
    // STMFD sp!,{x10,x11,x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    //// x0 - pu1_ref
    //// x3 - u4_ref_width

    //// Load six rows for vertical interpolation
    lsl       x12, x3, #1
    sub       x0, x0, x12
    sub       x0, x0, #2
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
    mov       x12, #5
    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
    mov       x14, #20
    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
    mov       v0.h[0], w12
    mov       v0.h[1], w14
    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
    movi      v1.8b, #20

    //// x12 - u2_buff1_width
    //// x14 - u2_buff2_width
    mov       x12, x4
    add       x11, x1, #16

    mov       x14, x12

    mov       x10, #3                         //// loop counter
    sub       x16, x12, #8
    sub       x19, x14, #16

filter_2dvh_loop:

    //// ////////////// ROW 1 ///////////////////////

    //// Process first vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4
    uaddl     v20.8h, v2.8b, v17.8b           //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v8.8b, v1.8b            //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v11.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v5.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v14.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]

    uaddl     v22.8h, v3.8b, v18.8b           //// a0 + a5 (column2,row0)
    umlal     v22.8h, v9.8b, v1.8b            //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v12.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v6.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v15.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    ext       v30.8b, v20.8b, v21.8b, #4
    mov       v23.d[0], v22.d[1]

    uaddl     v24.8h, v4.8b, v19.8b           //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v10.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v13.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v7.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v16.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    sqrshrun  v2.8b, v20.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v3.8b, v22.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v4.8b, v24.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2
    mov       v21.d[0], v20.d[1]
    ext       v2.8b, v2.8b, v3.8b, #2
    ext       v3.8b, v3.8b, v4.8b, #2
    ext       v4.8b, v4.8b, v4.8b, #2
    st1       {v2.8b, v3.8b}, [x1], x12       //// store row1 - 1,1/2 grid
    st1       {v4.h}[0], [x11], x12           //// store row1 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v2.4s, v31.4h, v22.4h           //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v2.4s, v30.4h, v0.h[1]          //// a0 + a5 + 20a2 (set3)
    smlal     v2.4s, v29.4h, v0.h[1]          //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v2.4s, v28.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v2.4s, v23.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v2.4s, #8               //// shift by 8 and later we will shift by 2 more with rounding (set3)

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    ////VQRSHRUN.s16 D27,Q14,#2   ;// half,half grid set3,4
    ////VSHRN.s32    D28,Q11,#8   ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2   ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1/2 grid values

    //// ////////////// ROW 2 ///////////////////////

    //// Process next vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4
    uaddl     v20.8h, v5.8b, v2.8b            //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v11.8b, v1.8b           //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v14.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v8.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v17.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4

    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    uaddl     v22.8h, v6.8b, v3.8b            //// a0 + a5 (column2,row0)
    umlal     v22.8h, v12.8b, v1.8b           //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v15.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v9.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v18.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5
    ext       v30.8b, v20.8b, v21.8b, #4

    uaddl     v24.8h, v7.8b, v4.8b            //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v13.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v16.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v10.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v19.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    sqrshrun  v5.8b, v20.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v6.8b, v22.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v7.8b, v24.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2

    ext       v5.8b, v5.8b, v6.8b, #2
    ext       v6.8b, v6.8b, v7.8b, #2
    ext       v7.8b, v7.8b, v7.8b, #2

    st1       {v5.8b, v6.8b}, [x1], x12       //// store row2 - 1,1/2 grid
    st1       {v7.h}[0], [x11], x12           //// store row2 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v6.4s, v31.4h, v22.4h           //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v6.4s, v30.4h, v0.h[1]          //// a0 + a5 + 20a2 (set3)
    smlal     v6.4s, v29.4h, v0.h[1]          //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v6.4s, v28.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v6.4s, v23.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v6.4s, #8               //// shift by 8 and later we will shift by 2 more with rounding (set3)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    ////VQRSHRUN.s16 D27,Q14,#2   ;// half,half grid set3,4
    ////VSHRN.s32    D28,Q11,#8   ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2   ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1/2 grid values

    //// ////////////// ROW 3 ///////////////////////

    //// Process next vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4
    uaddl     v20.8h, v8.8b, v5.8b            //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v14.8b, v1.8b           //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v17.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v11.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v2.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    uaddl     v22.8h, v9.8b, v6.8b            //// a0 + a5 (column2,row0)
    umlal     v22.8h, v15.8b, v1.8b           //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v18.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v12.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v3.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5
    ext       v30.8b, v20.8b, v21.8b, #4

    uaddl     v24.8h, v10.8b, v7.8b           //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v16.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v19.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v13.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v4.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    sqrshrun  v8.8b, v20.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v9.8b, v22.8h, #5               //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v10.8b, v24.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2

    ext       v8.8b, v8.8b, v9.8b, #2
    ext       v9.8b, v9.8b, v10.8b, #2
    ext       v10.8b, v10.8b, v10.8b, #2

    st1       {v8.8b, v9.8b}, [x1], x12       //// store row3 - 1,1/2 grid
    st1       {v10.h}[0], [x11], x12          //// store row3 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v8.4s, v31.4h, v22.4h           //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v8.4s, v30.4h, v0.h[1]          //// a0 + a5 + 20a2 (set3)
    smlal     v8.4s, v29.4h, v0.h[1]          //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v8.4s, v28.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v8.4s, v23.4h, v0.h[0]          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v8.4s, #8               //// shift by 8 and later we will shift by 2 more with rounding (set3)

    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    ////VQRSHRUN.s16 D27,Q14,#2   ;// half,half grid set3,4
    ////VSHRN.s32    D28,Q11,#8   ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2   ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1/2 grid values

    //// ////////////// ROW 4 ///////////////////////

    //// Process next vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4
    uaddl     v20.8h, v11.8b, v8.8b           //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v17.8b, v1.8b           //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v2.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v14.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v5.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)
    uaddl     v22.8h, v12.8b, v9.8b           //// a0 + a5 (column2,row0)
    umlal     v22.8h, v18.8b, v1.8b           //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v3.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v15.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v6.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5
    ext       v30.8b, v20.8b, v21.8b, #4

    uaddl     v24.8h, v13.8b, v10.8b          //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v19.8b, v1.8b           //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v4.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v16.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v7.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    sqrshrun  v11.8b, v20.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v12.8b, v22.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v13.8b, v24.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2

    ext       v11.8b, v11.8b, v12.8b, #2
    ext       v12.8b, v12.8b, v13.8b, #2
    ext       v13.8b, v13.8b, v13.8b, #2

    st1       {v11.8b, v12.8b}, [x1], x12     //// store row4 - 1,1/2 grid
    st1       {v13.h}[0], [x11], x12          //// store row4 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v12.4s, v31.4h, v22.4h          //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v12.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set3)
    smlal     v12.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v12.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v12.4s, v23.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v12.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set3)

    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    ////VQRSHRUN.s16 D27,Q14,#2   ;// half,half grid set3,4
    ////VSHRN.s32    D28,Q11,#8   ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2   ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1/2 grid values

    //// ////////////// ROW 5 ///////////////////////

    //// Process next vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4
    uaddl     v20.8h, v14.8b, v11.8b          //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v2.8b, v1.8b            //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v5.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v17.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v8.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    uaddl     v22.8h, v15.8b, v12.8b          //// a0 + a5 (column2,row0)
    umlal     v22.8h, v3.8b, v1.8b            //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v6.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v18.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v9.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5
    ext       v30.8b, v20.8b, v21.8b, #4

    uaddl     v24.8h, v16.8b, v13.8b          //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v4.8b, v1.8b            //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v7.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v19.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v10.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    sqrshrun  v14.8b, v20.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v15.8b, v22.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v16.8b, v24.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2

    ext       v14.8b, v14.8b, v15.8b, #2
    ext       v15.8b, v15.8b, v16.8b, #2
    ext       v16.8b, v16.8b, v16.8b, #2

    st1       {v14.8b, v15.8b}, [x1], x12     //// store row5 - 1,1/2 grid
    st1       {v16.h}[0], [x11], x12          //// store row5 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v14.4s, v31.4h, v22.4h          //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v14.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set3)
    smlal     v14.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v14.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v14.4s, v23.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v14.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set3)

    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    ////VQRSHRUN.s16 D27,Q14,#2   ;// half,half grid set3,4
    ////VSHRN.s32    D28,Q11,#8   ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2   ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1/2 grid values

    //// ////////////// ROW 6 ///////////////////////

    //// Process next vertical interpolated row
    //// each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4

    cmp       x10, #1                         //// if 17 rows are complete, skip the last row
    beq       filter_2dvh_skip_row
    uaddl     v20.8h, v17.8b, v14.8b          //// a0 + a5 (column1,row0)
    movi      v31.8b, #5
    umlal     v20.8h, v5.8b, v1.8b            //// a0 + a5 + 20a2 (column1,row0)
    umlal     v20.8h, v8.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl     v20.8h, v2.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl     v20.8h, v11.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov       v21.d[0], v20.d[1]
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    uaddl     v22.8h, v18.8b, v15.8b          //// a0 + a5 (column2,row0)
    umlal     v22.8h, v6.8b, v1.8b            //// a0 + a5 + 20a2 (column2,row0)
    umlal     v22.8h, v9.8b, v1.8b            //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl     v22.8h, v3.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl     v22.8h, v12.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov       v23.d[0], v22.d[1]

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5
    ext       v30.8b, v20.8b, v21.8b, #4

    uaddl     v24.8h, v19.8b, v16.8b          //// a0 + a5 (column3,row0)
    ext       v29.8b, v20.8b, v21.8b, #6
    umlal     v24.8h, v7.8b, v1.8b            //// a0 + a5 + 20a2 (column3,row0)
    umlal     v24.8h, v10.8b, v1.8b           //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl     v24.8h, v4.8b, v31.8b           //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl     v24.8h, v13.8b, v31.8b          //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov       v25.d[0], v24.d[1]

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    sqrshrun  v17.8b, v20.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext       v31.8b, v21.8b, v22.8b, #2
    sqrshrun  v18.8b, v22.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext       v28.8b, v20.8b, v21.8b, #2

    saddl     v26.4s, v31.4h, v20.4h          //// a0 + a5 (set1)
    ext       v31.8b, v22.8b, v23.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set1)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl     v26.4s, v21.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext       v30.8b, v21.8b, v22.8b, #4

    sqrshrun  v19.8b, v24.8h, #5              //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext       v29.8b, v21.8b, v22.8b, #6

    ext       v28.8b, v21.8b, v22.8b, #2
    saddl     v20.4s, v31.4h, v21.4h          //// a0 + a5 (set2)
    smlal     v20.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set2)
    smlal     v20.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl     v20.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl     v20.4s, v22.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext       v31.8b, v23.8b, v24.8b, #2

    ext       v17.8b, v17.8b, v18.8b, #2
    ext       v18.8b, v18.8b, v19.8b, #2
    ext       v19.8b, v19.8b, v19.8b, #2

    st1       {v17.8b, v18.8b}, [x1], x12     //// store row6 - 1,1/2 grid
    st1       {v19.h}[0], [x11], x12          //// store row6 - 1,1/2 grid

    ext       v30.8b, v22.8b, v23.8b, #4
    ext       v29.8b, v22.8b, v23.8b, #6

    saddl     v18.4s, v31.4h, v22.4h          //// a0 + a5 (set3)
    ext       v28.8b, v22.8b, v23.8b, #2
    smlal     v18.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set3)
    smlal     v18.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl     v18.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl     v18.4s, v23.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext       v31.8b, v24.8b, v25.8b, #2

    shrn      v21.4h, v20.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set2)
    ext       v30.8b, v23.8b, v24.8b, #4
    shrn      v20.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set1)
    ext       v29.8b, v23.8b, v24.8b, #6

    saddl     v26.4s, v31.4h, v23.4h          //// a0 + a5 (set4)
    ext       v28.8b, v23.8b, v24.8b, #2
    ext       v31.8b, v25.8b, v25.8b, #2
    smlal     v26.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set4)
    smlal     v26.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl     v26.4s, v28.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl     v26.4s, v24.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext       v30.8b, v24.8b, v25.8b, #4

    saddl     v22.4s, v31.4h, v24.4h          //// a0 + a5 (set5)
    ext       v29.8b, v24.8b, v25.8b, #6

    ext       v31.8b, v24.8b, v25.8b, #2
    shrn      v28.4h, v18.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set3)

    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next row data
    smlal     v22.4s, v30.4h, v0.h[1]         //// a0 + a5 + 20a2 (set5)
    smlal     v22.4s, v29.4h, v0.h[1]         //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl     v22.4s, v31.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl     v22.4s, v25.4h, v0.h[0]         //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn      v29.4h, v26.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2              //// half,half grid set1,2

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values

    subs      x10, x10, #1                    //// decrement loop counter

    bne       filter_2dvh_loop

    // LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

filter_2dvh_skip_row:
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2              //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8              //// shift by 8 and later we will shift by 2 more with rounding (set5)

    sqrshrun  v28.8b, v28.8h, #2              //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16     //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19           //// store 1/2,1/2 grid values
    // LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


///*****************************************