//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///*                                                                           */
///*  File Name         : ih264_deblk_chroma_av8.s                             */
///*                                                                           */
///*  Description       : Contains function definitions for deblocking chroma  */
///*                      edge. Functions are coded in NEON assembly and can   */
///*                      be compiled using ARM RVDS.                          */
///*                                                                           */
///*  List of Functions : ih264_deblk_chroma_vert_bs4_av8()                    */
///*                      ih264_deblk_chroma_vert_bslt4_av8()                  */
///*                      ih264_deblk_chroma_horz_bs4_av8()                    */
///*                      ih264_deblk_chroma_horz_bslt4_av8()                  */
///*  Issues / Problems : None                                                 */
///*                                                                           */
///*  Revision History  :                                                      */
///*                                                                           */
///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
///*         28 11 2013   Ittiam          Draft                                */
///*****************************************************************************/


.text
.p2align 2
.include "ih264_neon_macros.s"

///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*       Two rows (p0 and q0) of 8 interleaved U/V pixel pairs are rewritten.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*  Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*  Beta Value for the boundary in V
//*
//* @returns
//*  None
//*
//* @remarks
//*  Uses v8-v15, whose low halves are callee-saved under AAPCS64; saving them
//*  is left to push_v_regs/pop_v_regs (defined in ih264_neon_macros.s, not
//*  visible here - presumably they spill/restore d8-d15; confirm).
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bs4_av8

ih264_deblk_chroma_horz_bs4_av8:

    // STMFD sp!,{x4-x6,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!     //x19/x20 are saved although not referenced below
    sxtw      x1, w1                    //x1 = sign-extended src_strd
    mov       x6, x5                    //w6 = beta_cr
    mov       x5, x4                    //w5 = alpha_cr (frees x4 for the store pointer)
    sub       x0, x0, x1, lsl #1        //x0 = pu1_src - 2 * src_strd, i.e. row p1
    ld2       {v6.8b, v7.8b}, [x0], x1  //v6 = p1u , v7 = p1v (ld2 de-interleaves U/V)
    mov       x4, x0                    //x4 = backup of the pointer to row p0
    ld2       {v4.8b, v5.8b}, [x0], x1  //v4 = p0u , v5 = p0v
    dup       v20.8b, w2                //v20 (low half) = alpha_cb
    dup       v21.8b, w5                //v21 (low half) = alpha_cr
    mov       v20.d[1], v21.d[0]        //v20 = {alpha_cb x 8, alpha_cr x 8}
    ld2       {v0.8b, v1.8b}, [x0], x1  //v0 = q0u , v1 = q0v
    uaddl     v8.8h, v6.8b, v0.8b
    uaddl     v10.8h, v7.8b, v1.8b      //v8,v10 = p1 + q0 (U,V)
    movi      v31.8b, #2
    ld2       {v2.8b, v3.8b}, [x0]      //v2 = q1u , v3 = q1v
    mov       v0.d[1], v1.d[0]          //v0 = {q0u, q0v}
    mov       v2.d[1], v3.d[0]          //v2 = {q1u, q1v}
    mov       v4.d[1], v5.d[0]          //v4 = {p0u, p0v}
    mov       v6.d[1], v7.d[0]          //v6 = {p1u, p1v}
    uabd      v26.16b, v6.16b, v4.16b   //v26 = ABS(p1 - p0)
    umlal     v8.8h, v2.8b, v31.8b
    umlal     v10.8h, v3.8b, v31.8b     //v8,v10 = 2*q1 + q0 + p1 (U,V)
    uabd      v22.16b, v4.16b, v0.16b   //v22 = ABS(p0 - q0)
    uabd      v24.16b, v2.16b, v0.16b   //v24 = ABS(q1 - q0)
    uaddl     v14.8h, v4.8b, v2.8b
    uaddl     v28.8h, v5.8b, v3.8b      //v14,v28 = p0 + q1 (U,V)
    dup       v16.8b, w3                //v16 (low half) = beta_cb
    dup       v17.8b, w6                //v17 (low half) = beta_cr
    mov       v16.d[1], v17.d[0]        //v16 = {beta_cb x 8, beta_cr x 8}
    umlal     v14.8h, v6.8b, v31.8b
    umlal     v28.8h, v7.8b, v31.8b     //v14,v28 = 2*p1 + p0 + q1 (U,V)
    cmhs      v18.16b, v22.16b, v20.16b //v18 = ABS(p0 - q0) >= alpha
    cmhs      v24.16b, v24.16b, v16.16b //v24 = ABS(q1 - q0) >= beta
    cmhs      v26.16b, v26.16b, v16.16b //v26 = ABS(p1 - p0) >= beta
    rshrn     v8.8b, v8.8h, #2
    rshrn     v9.8b, v10.8h, #2         //v8,v9 = (2*q1 + q0 + p1 + 2) >> 2 (U,V)
    mov       v8.d[1], v9.d[0]          //v8 = filtered q0 {U, V}
    orr       v18.16b, v18.16b, v24.16b //v18 = (ABS(p0 - q0) >= alpha) | (ABS(q1 - q0) >= beta)
    rshrn     v10.8b, v14.8h, #2
    rshrn     v11.8b, v28.8h, #2        //v10,v11 = (2*p1 + p0 + q1 + 2) >> 2 (U,V)
    mov       v10.d[1], v11.d[0]        //v10 = filtered p0 {U, V}
    orr       v18.16b, v18.16b, v26.16b //v18 |= (ABS(p1 - p0) >= beta): "do not filter" mask
    bit       v10.16b, v4.16b, v18.16b  //restore original p0 where the mask is set
    bit       v8.16b, v0.16b, v18.16b   //restore original q0 where the mask is set
    mov       v11.d[0], v10.d[1]        //split back into U (v10) and V (v11) halves
    mov       v9.d[0], v8.d[1]          //split back into U (v8) and V (v9) halves
    st2       {v10.8b, v11.8b}, [x4], x1 //write row p0, re-interleaving U/V
    st2       {v8.8b, v9.8b}, [x4]      //write row q0
    // LDMFD sp!,{x4-x6,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge when the
//*     boundary strength is set to 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.4 under the title
//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*       Eight rows are processed; each row contributes the four interleaved
//*       U/V pairs p1, p0, q0, q1 that straddle the edge.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*  Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*  Beta Value for the boundary in V
//*
//* @returns
//*  None
//*
//* @remarks
//*  alpha/beta for U and V are packed into one halfword each, so the values
//*  are assumed to fit in 8 bits.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bs4_av8

ih264_deblk_chroma_vert_bs4_av8:

    // STMFD sp!,{x4,x5,x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!     //x19/x20 are saved although not referenced below
    sxtw      x1, w1                    //x1 = sign-extended src_strd

    sub       x0, x0, #4                //point x0 to p1u of row0.
    mov       x12, x0                   //keep a back up of x0 for buffer write

    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)

    //Each lane load reads one row: halfwords (p1u,p1v) (p0u,p0v) (q0u,q0v) (q1u,q1v)
    //go to lane i of four consecutive registers. Rows 0-3 -> v0-v3, rows 4-7 -> v4-v7.
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    //Permute so that register pairs hold one tap each:
    //v0/v1 = p1, v2/v3 = p0, v4/v5 = q0, v6/v7 = q1 (rows 0-3 / rows 4-7)
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b

    dup       v22.8h, w2                //v22 = alpha, (cb,cr) per halfword
    dup       v24.8h, w3                //v24 = beta, (cb,cr) per halfword
    movi      v31.8b, #2

    mov       v0.d[1], v1.d[0]          //v0 = p1, all 8 rows
    mov       v2.d[1], v3.d[0]          //v2 = p0, all 8 rows
    mov       v4.d[1], v5.d[0]          //v4 = q0, all 8 rows
    mov       v6.d[1], v7.d[0]          //v6 = q1, all 8 rows

    uabd      v8.16b, v2.16b, v4.16b    //|p0-q0|
    uabd      v10.16b, v6.16b, v4.16b   //|q1-q0|
    uabd      v12.16b, v0.16b, v2.16b   //|p1-p0|
    uaddl     v14.8h, v2.8b, v6.8b
    uaddl     v16.8h, v3.8b, v7.8b      //(p0 + q1)
    cmhi      v8.16b, v22.16b, v8.16b   //|p0-q0| < alpha ?
    cmhi      v10.16b, v24.16b, v10.16b //|q1-q0| < beta ?
    cmhi      v12.16b, v24.16b, v12.16b //|p1-p0| < beta ?
    umlal     v14.8h, v0.8b, v31.8b
    umlal     v16.8h, v1.8b, v31.8b     //2*p1 + (p0 + q1)
    uaddl     v18.8h, v0.8b, v4.8b
    uaddl     v20.8h, v1.8b, v5.8b      //(p1 + q0)
    and       v8.16b, v8.16b, v10.16b   //|p0-q0| < alpha && |q1-q0| < beta
    umlal     v18.8h, v6.8b, v31.8b
    umlal     v20.8h, v7.8b, v31.8b     //2*q1 + (p1 + q0)

    rshrn     v14.8b, v14.8h, #2
    rshrn     v15.8b, v16.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2
    mov       v14.d[1], v15.d[0]        //v14 = filtered p0, all 8 rows
    and       v8.16b, v8.16b, v12.16b   //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    rshrn     v18.8b, v18.8h, #2
    rshrn     v19.8b, v20.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2
    mov       v18.d[1], v19.d[0]        //v18 = filtered q0, all 8 rows
    bit       v2.16b, v14.16b, v8.16b   //take filtered p0 only where the mask is set
    bit       v4.16b, v18.16b, v8.16b   //take filtered q0 only where the mask is set

    //Undo the packing: split the 128-bit taps back into rows 0-3 / rows 4-7 ...
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    //... and restore the register order expected by st4
    //(v0-v3 = p1,p0,q0,q1 of rows 0-3; v4-v7 = the same for rows 4-7)
    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b

    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    // LDMFD sp!,{x4,x5,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block horizontal edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*  Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*  Beta Value for the boundary in V
//*
//* @param[in] w6 - u4_bs
//*  Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*  tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*  tc0_table for V
//*
//* @returns
//*  None
//*
//* @remarks
//*  Only the first 4 bytes of each clip table are loaded, so the table
//*  look-ups are meaningful only for boundary-strength bytes in 0..3.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_horz_bslt4_av8

ih264_deblk_chroma_horz_bslt4_av8:

    // STMFD sp!,{x4-x9,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!     //x19/x20 are saved although not referenced below
    sxtw      x1, w1                    //x1 = sign-extended src_strd
    //Stack argument pu1_cliptab_cr: offset 80 = 16 bytes (x19/x20 above) plus the
    //bytes pushed by push_v_regs (presumably 64 for d8-d15 - confirm in ih264_neon_macros.s)
    ldr       x8, [sp, #80]
    sub       x0, x0, x1, lsl #1        //x0 = pu1_src - 2 * src_strd, i.e. row p1
    rev       w6, w6                    //byte-reverse u4_bs so its top byte lands in byte lane 0
    mov       v12.s[0], w6              //v12.s[0] = byte-reversed u4_bs
    ld1       {v16.s}[0], [x7]          //v16.s[0] = first 4 bytes of cliptab_cb
    ld1       {v17.s}[0], [x8]          //v17.s[0] = first 4 bytes of cliptab_cr
    ld2       {v6.8b, v7.8b}, [x0], x1  //v6 = p1u , v7 = p1v (ld2 de-interleaves U/V)
    tbl       v14.8b, {v16.16b}, v12.8b //Retrieving cliptab values for U (index = bS byte)
    tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V (index = bS byte)
    uxtl      v12.8h, v12.8b            //v12 = bS widened to one 16 bit lane each
    mov       x6, x0                    //x6 = backup of the pointer to row p0
    ld2       {v4.8b, v5.8b}, [x0], x1  //v4 = p0u , v5 = p0v
    movi      v30.8b, #1
    dup       v20.8b, w2                //v20 (low half) = alpha_cb
    dup       v21.8b, w4                //v21 (low half) = alpha_cr
    mov       v20.d[1], v21.d[0]        //v20 = {alpha_cb x 8, alpha_cr x 8}
    ld2       {v0.8b, v1.8b}, [x0], x1  //v0 = q0u , v1 = q0v
    uxtl      v14.8h, v14.8b
    uxtl      v28.8h, v28.8b            //clip values widened to 16 bit lanes
    mov       v15.d[0], v28.d[0]        //v14.d[0] has cliptab values for U, v15.d[0] for V
    mov       v14.d[1], v28.d[0]
    ld2       {v2.8b, v3.8b}, [x0]      //v2 = q1u , v3 = q1v
    usubl     v10.8h, v1.8b, v5.8b
    usubl     v8.8h, v0.8b, v4.8b       //v8,v10 = (q0 - p0) (U,V)
    mov       v6.d[1], v7.d[0]          //v6 = {p1u, p1v}
    mov       v4.d[1], v5.d[0]          //v4 = {p0u, p0v}
    uabd      v26.16b, v6.16b, v4.16b   //v26 = ABS(p1 - p0)
    shl       v10.8h, v10.8h, #2        //v10 = (q0 - p0)<<2 (V)
    mov       v0.d[1], v1.d[0]          //v0 = {q0u, q0v}
    uabd      v22.16b, v4.16b, v0.16b   //v22 = ABS(p0 - q0)
    shl       v8.8h, v8.8h, #2          //v8 = (q0 - p0)<<2 (U)
    mov       v14.d[1], v15.d[0]
    sli       v14.8h, v14.8h, #8        //copy each clip value into both bytes of its halfword
    mov       v15.d[0], v14.d[1]
    mov       v2.d[1], v3.d[0]          //v2 = {q1u, q1v}
    uabd      v24.16b, v2.16b, v0.16b   //v24 = ABS(q1 - q0)
    cmhs      v18.16b, v22.16b, v20.16b //v18 = ABS(p0 - q0) >= alpha
    usubl     v20.8h, v6.8b, v2.8b      //v20 = (p1 - q1) (U)
    usubl     v6.8h, v7.8b, v3.8b       //v6 = (p1 - q1) (V)
    dup       v16.8b, w3                //v16 (low half) = beta_cb
    dup       v17.8b, w5                //v17 (low half) = beta_cr
    mov       v16.d[1], v17.d[0]        //v16 = {beta_cb x 8, beta_cr x 8}
    add       v8.8h, v8.8h, v20.8h
    add       v10.8h, v10.8h, v6.8h     //v8,v10 = [ (q0 - p0)<<2 ] + (p1 - q1)
    cmhs      v24.16b, v24.16b, v16.16b //v24 = ABS(q1 - q0) >= beta
    cmgt      v12.4h, v12.4h, #0        //v12 (low half) = bS > 0 mask, 16 bits per bS
    sqrshrn   v8.8b, v8.8h, #3
    sqrshrn   v9.8b, v10.8h, #3         //i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3, saturated to s8
    mov       v8.d[1], v9.d[0]          //v8 = i_macro {U, V}
    add       v14.8b, v14.8b, v30.8b    //v14 (low half) = C = C0+1 for U
    cmhs      v26.16b, v26.16b, v16.16b //v26 = ABS(p1 - p0) >= beta
    orr       v18.16b, v18.16b, v24.16b //v18 = (ABS(p0 - q0) >= alpha) | (ABS(q1 - q0) >= beta)
    abs       v6.16b, v8.16b            //v6 = ABS(i_macro)
    add       v15.8b, v15.8b, v30.8b    //v15 (low half) = C = C0+1 for V
    mov       v14.d[1], v15.d[0]        //v14 = C {U, V}
    mov       v13.8b, v12.8b
    mov       v12.d[1], v13.d[0]        //replicate the bS > 0 mask into the V half
    orr       v18.16b, v18.16b, v26.16b //v18 |= (ABS(p1 - p0) >= beta)
    umin      v14.16b, v6.16b, v14.16b  //v14 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    bic       v12.16b, v12.16b, v18.16b //final condition: bS > 0 and no threshold reached
    cmge      v8.16b, v8.16b, #0        //v8 = (i_macro >= 0)
    and       v14.16b, v14.16b, v12.16b //Making delta zero in places where values shouldn't be filtered
    uqadd     v16.16b, v4.16b, v14.16b  //v16 = p0 + delta
    uqsub     v4.16b, v4.16b, v14.16b   //v4 = p0 - delta
    uqadd     v18.16b, v0.16b, v14.16b  //v18 = q0 + delta
    uqsub     v0.16b, v0.16b, v14.16b   //v0 = q0 - delta
    bif       v16.16b, v4.16b, v8.16b   //v16 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    bif       v0.16b, v18.16b, v8.16b   //v0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    mov       v17.d[0], v16.d[1]        //split back into U (v16) and V (v17) halves
    mov       v1.d[0], v0.d[1]          //split back into U (v0) and V (v1) halves
    st2       {v16.8b, v17.8b}, [x6], x1 //write row p0, re-interleaving U/V
    st2       {v0.8b, v1.8b}, [x6]      //write row q0

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a chroma block vertical edge for cases where the
//*     boundary strength is less than 4 in high profile
//*
//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*  Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*  Source stride
//*
//* @param[in] w2 - alpha_cb
//*  Alpha Value for the boundary in U
//*
//* @param[in] w3 - beta_cb
//*  Beta Value for the boundary in U
//*
//* @param[in] w4 - alpha_cr
//*  Alpha Value for the boundary in V
//*
//* @param[in] w5 - beta_cr
//*  Beta Value for the boundary in V
//*
//* @param[in] w6 - u4_bs
//*  Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*  tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*  tc0_table for V
//*
//* @returns
//*  None
//*
//* @remarks
//*  Only the first 4 bytes of each clip table are loaded, so the table
//*  look-ups are meaningful only for boundary-strength bytes in 0..3.
//*  alpha/beta for U and V are packed into one halfword each, so the values
//*  are assumed to fit in 8 bits.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_chroma_vert_bslt4_av8

ih264_deblk_chroma_vert_bslt4_av8:

    // STMFD sp!,{x4-x7,x10-x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!     //x19/x20 are saved although not referenced below
    sxtw      x1, w1                    //x1 = sign-extended src_strd
    mov       x10, x7                   //x10 = pu1_cliptab_cb
    //Stack argument pu1_cliptab_cr: offset 80 = 16 bytes (x19/x20 above) plus the
    //bytes pushed by push_v_regs (presumably 64 for d8-d15 - confirm in ih264_neon_macros.s)
    ldr       x11, [sp, #80]            //x11 = pu1_cliptab_cr
    sub       x0, x0, #4                //point x0 to p1u of row0.
    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
    mov       x12, x0                   //keep a back up of x0 for buffer write

    //Each lane load reads one row: halfwords (p1u,p1v) (p0u,p0v) (q0u,q0v) (q1u,q1v)
    //go to lane i of four consecutive registers. Rows 0-3 -> v0-v3, rows 4-7 -> v4-v7.
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    //Permute so that register pairs hold one tap each:
    //v0/v1 = p1, v2/v3 = p0, v4/v5 = q0, v6/v7 = q1 (rows 0-3 / rows 4-7)
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b
    dup       v22.8h, w2                //v22 = alpha, (cb,cr) per halfword
    mov       v2.d[1], v3.d[0]          //v2 = p0, all 8 rows
    mov       v4.d[1], v5.d[0]          //v4 = q0, all 8 rows
    uabd      v8.16b, v2.16b, v4.16b    //|p0-q0|
    dup       v24.8h, w3                //v24 = beta, (cb,cr) per halfword
    mov       v25.d[0], v24.d[1]
    mov       v6.d[1], v7.d[0]          //v6 = q1, all 8 rows
    mov       v0.d[1], v1.d[0]          //v0 = p1, all 8 rows
    uabd      v10.16b, v6.16b, v4.16b   //|q1-q0|
    uabd      v12.16b, v0.16b, v2.16b   //|p1-p0|
    cmhi      v8.16b, v22.16b, v8.16b   //|p0-q0| < alpha ?
    usubl     v14.8h, v0.8b, v6.8b
    cmhi      v10.16b, v24.16b, v10.16b //|q1-q0| < beta ?
    usubl     v16.8h, v1.8b, v7.8b      //(p1 - q1)
    cmhi      v12.16b, v24.16b, v12.16b //|p1-p0| < beta ?
    usubl     v18.8h, v4.8b, v2.8b
    and       v8.16b, v8.16b, v10.16b   //|p0-q0| < alpha && |q1-q0| < beta
    usubl     v20.8h, v5.8b, v3.8b      //(q0 - p0)
    movi      v28.8h, #4
    ld1       {v24.s}[0], [x10]         //Load ClipTable for U (first 4 bytes)
    ld1       {v25.s}[0], [x11]         //Load ClipTable for V (first 4 bytes)
    rev       w6, w6                    //Blocking strengths, byte-reversed so the top byte lands in lane 0
    and       v8.16b, v8.16b, v12.16b   //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    mov       v10.s[0], w6
    mla       v14.8h, v18.8h, v28.8h
    mla       v16.8h, v20.8h, v28.8h    //4*(q0 - p0) + (p1 - q1)
    uxtl      v10.8h, v10.8b
    sli       v10.4h, v10.4h, #8        //each bS byte duplicated: one copy per row of its row pair
    tbl       v12.8b, {v24.16b}, v10.8b //tC0 for U
    tbl       v13.8b, {v25.16b}, v10.8b //tC0 for V
    zip1      v31.8b, v12.8b, v13.8b    //interleave U/V tC0 to match the pixel layout (rows 0-3)
    zip2      v13.8b, v12.8b, v13.8b    //same for rows 4-7
    mov       v12.8b, v31.8b
    mov       v12.d[1], v13.d[0]        //v12 = tC0, (u,v) per row, all 8 rows
    uxtl      v10.4s, v10.4h
    sli       v10.4s, v10.4s, #16       //v10 = each bS replicated over the 4 bytes of its row pair
    movi      v24.16b, #1
    add       v12.16b, v12.16b, v24.16b //tC0 + 1
    cmhs      v10.16b, v10.16b, v24.16b //u4_bs >= 1 ?
    and       v8.16b, v8.16b, v10.16b   //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
    // v0, v2, v4, v6 (inputs p1, p0, q0, q1),
    // v8  (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    // v12 (tC = tC0 + 1)
    srshr     v14.8h, v14.8h, #3
    srshr     v16.8h, v16.8h, #3        //delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
    cmgt      v18.8h, v14.8h, #0
    cmgt      v20.8h, v16.8h, #0
    xtn       v18.8b, v18.8h
    xtn       v19.8b, v20.8h            //v18,v19 = (delta > 0) mask, one byte per sample
    mov       v18.d[1], v19.d[0]
    abs       v14.8h, v14.8h
    abs       v16.8h, v16.8h
    xtn       v14.8b, v14.8h
    xtn       v15.8b, v16.8h
    mov       v14.d[1], v15.d[0]        //v14 = |delta| narrowed to bytes
    umin      v14.16b, v14.16b, v12.16b //v14 = min(|delta|, tC)
    uqadd     v20.16b, v2.16b, v14.16b  //p0+|delta|
    uqadd     v22.16b, v4.16b, v14.16b  //q0+|delta|
    uqsub     v24.16b, v2.16b, v14.16b  //p0-|delta|
    uqsub     v26.16b, v4.16b, v14.16b  //q0-|delta|
    bit       v24.16b, v20.16b, v18.16b //p0 + delta
    bit       v22.16b, v26.16b, v18.16b //q0 - delta
    bit       v2.16b, v24.16b, v8.16b   //take filtered p0 only where the mask is set
    bit       v4.16b, v22.16b, v8.16b   //take filtered q0 only where the mask is set

    //Undo the packing: split the 128-bit taps back into rows 0-3 / rows 4-7 ...
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    //... and restore the register order expected by st4
    //(v0-v3 = p1,p0,q0,q1 of rows 0-3; v4-v7 = the same for rows 4-7)
    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b
    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret