///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class0_chroma.s
//*
//* @brief
//*  Contains function definitions for sample adaptive offset (SAO) edge
//*  offset, class 0 (horizontal), applied to interleaved chroma. Functions
//*  are coded in NEON assembly and can be compiled using ARM RVCT.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_edge_offset_class0_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
//                                         WORD32 src_strd,
//                                         UWORD8 *pu1_src_left,
//                                         UWORD8 *pu1_src_top,
//                                         UWORD8 *pu1_src_top_left,
//                                         UWORD8 *pu1_src_top_right,
//                                         UWORD8 *pu1_src_bot_left,
//                                         UWORD8 *pu1_avail,
//                                         WORD8 *pi1_sao_offset_u,
//                                         WORD8 *pi1_sao_offset_v,
//                                         WORD32 wd,
//                                         WORD32 ht)
//
//**************Variables Vs Registers*****************************************
//x0 =>  *pu1_src
//x1 =>  src_strd
//x2 =>  *pu1_src_left
//x3 =>  *pu1_src_top
//x4 =>  *pu1_src_top_left
//x7 =>  *pu1_avail
//x8 =>  *pi1_sao_offset_u
//x5 =>  *pi1_sao_offset_v
//x9 =>  wd
//x10=>  ht
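//
//A rough scalar sketch of what this routine computes (illustrative only:
//SIGN/CLIP3 and the loop structure below are assumptions for the reader,
//not code taken from this file). SAO edge offset class 0 compares every
//chroma sample with its left and right neighbours of the same component
//(2 bytes away, since U and V are interleaved), maps the sign pattern
//through gi1_table_edge_idx and adds the matching offset from
//pi1_sao_offset_u or pi1_sao_offset_v; pu1_avail[0]/pu1_avail[1] disable
//the filter on unavailable left/right picture edges (au1_mask below):
//
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < wd; col++)
//        {
//            sign_left  = SIGN(pu1_src[col] - pu1_src[col - 2]);
//            sign_right = SIGN(pu1_src[col] - pu1_src[col + 2]);
//            edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
//            offset     = (col & 1) ? pi1_sao_offset_v[edge_idx]
//                                   : pi1_sao_offset_u[edge_idx];
//            pu1_src[col] = CLIP3(pu1_src[col] + offset, 0, 255);
//        }
//        pu1_src += src_strd;
//    }
//
//The NEON code below does this 16 bytes (8 U/V pairs) per iteration, two
//rows at a time, with an 8-byte residue path for wd % 16 == 8.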

.text
.p2align 2
.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class0_chroma_av8

ihevc_sao_edge_offset_class0_chroma_av8:

    ldr         x8,[sp,#0]
    ldr         x9,[sp,#8]
    ldr         w10,[sp,#16]
    ldr         w11,[sp,#24]



    // STMFD sp!, {x4-x12, x14}      //stack stores the values of the arguments
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!
    stp         x25, x26,[sp,#-16]!

    mov         x15,x4              // *pu1_src_top_left 40
    mov         x16,x5              // *pu1_src_top_right 44
    mov         x17,x6              // *pu1_src_bot_left 48
    mov         x21,x7              // *pu1_avail 52
    mov         x22,x8              // *pi1_sao_offset_u 56
    mov         x23,x9              // *pi1_sao_offset_v 60
    mov         x24,x10             // wd 64
    mov         x25,x11             // ht 68

    MOV         x9, x24             //Loads wd

    MOV         x4, x15             //Loads pu1_src_top_left
    ADD         x11,x3,x9           //pu1_src_top[wd]

    MOV         x10, x25            //Loads ht
    movi        v2.16b, #2          //const_2 = vdupq_n_s8(2)
    SUB         x20,x11,#2
    LDRH        w12,[x20]           //pu1_src_top[wd - 1]

    MOV         x7, x21             //Loads pu1_avail
    movi        v4.8h, #0           //const_min_clip = vdupq_n_s16(0)
    STRH        w12,[x4]            //*pu1_src_top_left = pu1_src_top[wd - 1]

    MOV         x8, x22             //Loads pi1_sao_offset_u
    movi        v6.8h, #255         //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUB         x4,x10,#1           //(ht - 1)

    ADRP        x14, :got:gi1_table_edge_idx        //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    movi        v3.16b, #0xFF       //au1_mask = vdupq_n_s8(-1)
    mul         x4, x4, x1          //(ht - 1) * src_strd

    MOV         x5, x23             //Loads pi1_sao_offset_v
    LD1         {v7.8b},[x8]        //offset_tbl = vld1_s8(pi1_sao_offset_u)
    ADD         x4,x4,x0            //pu1_src[(ht - 1) * src_strd]

    MOV         x6,x0               //pu1_src_org
    LD1         {v5.8b},[x14]       //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         x12,x9              //Move wd to x12 for loop count

SRC_TOP_LOOP:                       //wd is always a multiple of 8
    LD1         {v0.8b},[x4],#8     //Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        x12,x12,#8          //Decrement the loop counter by 8
    ST1         {v0.8b},[x3],#8     //Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
    ADD         x6,x6,#14           //pu1_src_org[14]

    MOV         x3,x2               //pu1_src_left backup, to reload later
    LD1         {v0.8b},[x5]        //offset_tbl = vld1_s8(pi1_sao_offset_v)
    CMP         x9,#16              //Compare wd with 16

    BLT         WIDTH_RESIDUE       //If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-column case

    MOV         x8,x9               //move wd to x8 for loop count

WIDTH_LOOP_16:
    CMP         x8,x9               //if(col == wd)
    BNE         AU1_MASK_FF         //jump to else part
    LDRB        w12,[x7]            //pu1_avail[0]
    mov         v3.8b[0], w12       //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    mov         v3.8b[1], w12       //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF    //Skip the else part

AU1_MASK_FF:
    MOV         x12,#-1             //move -1 to x12
    mov         v3.4h[0], w12       //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF:
    CMP         x8,#16              //If col == 16
    BNE         SKIP_MASKING_IF_NOT16       //If not, skip masking
    LDRB        w12,[x7,#1]         //pu1_avail[1]
    mov         v3.8b[14], w12      //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov         v3.8b[15], w12      //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_MASKING_IF_NOT16:
    MOV         x12,x0              //pu1_src_cpy = pu1_src
    MOV         x4,x10              //move ht to x4 for loop count

PU1_SRC_LOOP:
    LDRH        w11,[x2]            //load pu1_src_left (ht - row = 0 on the first pass; pu1_src_left is incremented later)
    LD1         {v19.16b},[x12],x1  //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //LD1       {v13.8b},[x12],x1   //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //SUB       x12, x12,#8
    SUB         x5,x9,x8            //wd - col

    SUB         x14,x10,x4          //ht - row
    mov         v21.4h[7], w11      //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    mul         x14, x14, x1        //(ht - row) * src_strd

    LD1         {v30.16b},[x12]     //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //LD1       {v31.8b},[x12]      //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //SUB       x12, x12,#8
    EXT         v21.16b, v21.16b , v19.16b,#14      //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    SUB         x12,x12,x1

    LDRH        w11,[x2,#2]         //II load pu1_src_left
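    //The cmhi/SUB sequences below form SIGN(cur - neighbour) per byte without
    //a signed subtract: each unsigned compare yields 0xFF (-1) or 0x00, so
    //cmp_lt - cmp_gt leaves -1, 0 or +1 in every lane. Intrinsic-level sketch
    //(illustrative only, mirroring the comments in this loop):
    //
    //    cmp_gt    = vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp);
    //    cmp_lt    = vcltq_u8(pu1_cur_row, pu1_cur_row_tmp);
    //    sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt));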
    cmhi        v16.16b, v19.16b , v21.16b          //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         x5,x14,x5           //(ht - row) * src_strd + (wd - col)

    mov         v28.4h[7], w11      //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    cmhi        v18.16b, v21.16b , v19.16b          //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRH        w14,[x6,x5]         //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    SUB         v20.16b, v18.16b , v16.16b          //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x4,x4,#1

    LDRB        w11,[x12,#16]       //pu1_src_cpy[16]
    EXT         v28.16b, v28.16b , v30.16b,#14      //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)

    mov         v21.8b[0], w11      //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    cmhi        v26.16b, v30.16b , v28.16b          //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        w11,[x12,#17]       //pu1_src_cpy[17]
    cmhi        v24.16b, v28.16b , v30.16b          //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    STRH        w14,[x2],#2         //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    ADD         x12,x12,x1
    mov         v21.8b[1], w11      //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    LDRB        w11,[x12,#16]       //II pu1_src_cpy[16]

    EXT         v21.16b, v19.16b , v21.16b,#2       //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    mov         v28.8b[0], w11      //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)

    LDRB        w11,[x12,#17]       //II pu1_src_cpy[17]
    cmhi        v16.16b, v19.16b , v21.16b          //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    SUB         x12,x12,x1

    cmhi        v18.16b, v21.16b , v19.16b          //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    mov         v28.8b[1], w11      //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)

    SUB         v22.16b, v18.16b , v16.16b          //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    EXT         v28.16b, v30.16b , v28.16b,#2       //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)

    ADD         v21.16b, v2.16b , v20.16b           //edge_idx = vaddq_s8(const_2, sign_left)

    mov         v5.d[1],v5.d[0]
    ADD         v21.16b, v21.16b , v22.16b          //edge_idx = vaddq_s8(edge_idx, sign_right)
    TBL         v21.16b, {v5.16b},v21.16b           //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SUB         v20.16b, v24.16b , v26.16b          //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    cmhi        v26.16b, v30.16b , v28.16b          //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    // TBL      v15.8b, {v10.16b},v15.8b            //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    cmhi        v24.16b, v28.16b , v30.16b          //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

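    //Chroma is interleaved U,V,U,V,... but U and V carry separate SAO offset
    //tables, so the masked edge indices are de-interleaved with UZP1/UZP2,
    //looked up separately in v7 (pi1_sao_offset_u) and v0 (pi1_sao_offset_v),
    //and the resulting offsets re-interleaved with ZIP1/ZIP2 before the
    //widening add and clip. Roughly, in intrinsic form (illustrative only):
    //
    //    edge_idx_u = vuzp1_s8(edge_idx_lo, edge_idx_hi);    //U lanes
    //    edge_idx_v = vuzp2_s8(edge_idx_lo, edge_idx_hi);    //V lanes
    //    offset_u   = vtbl1_s8(offset_tbl_u, edge_idx_u);
    //    offset_v   = vtbl1_s8(offset_tbl_v, edge_idx_v);
    //    offset_lo  = vzip1_s8(offset_u, offset_v);          //re-interleave
    //    offset_hi  = vzip2_s8(offset_u, offset_v);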
    AND         v21.16b, v21.16b , v3.16b           //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v23.d[0],v21.d[1]
    UZP1        v1.8b, v21.8b, v23.8b
    UZP2        v23.8b, v21.8b, v23.8b
    mov         v21.8b, v1.8b

    //mov       v11.d[1],v0.d[0]
    //mov       v14.d[1],v15.d[0]
    SUB         v22.16b, v24.16b , v26.16b          //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    TBL         v16.8b, {v7.16b},v21.8b             //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    ADD         v24.16b, v2.16b , v20.16b           //II edge_idx = vaddq_s8(const_2, sign_left)

    Uxtl        v18.8h, v19.8b      //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL         v17.8b, {v0.16b},v23.8b
    ADD         v24.16b, v24.16b , v22.16b          //II edge_idx = vaddq_s8(edge_idx, sign_right)

    //mov       v17.d[0],v16.d[1]
    ZIP1        v1.8b, v16.8b, v17.8b
    ZIP2        v17.8b, v16.8b, v17.8b
    mov         v16.8b, v1.8b
    TBL         v24.16b, {v5.16b},v24.16b           //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    Uxtl2       v19.8h, v19.16b     //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    //mov       v16.d[1],v17.d[0]
    SADDW       v18.8h, v18.8h , v16.8b             //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    //TBL       v25.8b, {v10.16b},v25.8b            //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    SMAX        v18.8h, v18.8h , v4.8h              //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    AND         v24.16b, v24.16b , v3.16b           //II edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v25.d[0],v24.d[1]
    UMIN        v18.8h, v18.8h , v6.8h              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    UZP1        v1.8b, v24.8b, v25.8b
    UZP2        v25.8b, v24.8b, v25.8b              //II
    mov         v24.8b, v1.8b

    //mov       v24.d[1],v25.d[0]
    SADDW       v19.8h, v19.8h , v17.8b             //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    TBL         v26.8b, {v7.16b},v24.8b             //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    SMAX        v19.8h, v19.8h , v4.8h              //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN        v19.8h, v19.8h , v6.8h              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    TBL         v27.8b, {v0.16b},v25.8b             //II
    xtn         v21.8b, v18.8h      //vmovn_s16(pi2_tmp_cur_row.val[0])

    //mov       v27.d[0],v26.d[1]
    xtn         v23.8b, v19.8h      //vmovn_s16(pi2_tmp_cur_row.val[1])
    ZIP1        v1.8b, v26.8b, v27.8b
    ZIP2        v27.8b, v26.8b, v27.8b              //II
    mov         v26.8b, v1.8b

    //mov       v26.d[1],v27.d[0]
    SUB         x5,x9,x8            //II wd - col
    Uxtl        v28.8h, v30.8b      //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SUB         x14,x10,x4          //II ht - row

    mul         x14, x14, x1        //II (ht - row) * src_strd
    SADDW       v28.8h, v28.8h , v26.8b             //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    ADD         x5,x14,x5           //II (ht - row) * src_strd + (wd - col)

    LDRH        w14,[x6,x5]         //II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    SMAX        v28.8h, v28.8h , v4.8h              //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    STRH        w14,[x2],#2         //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    UMIN        v28.8h, v28.8h , v6.8h              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    //mov       v31.2d[0],v30.2d[1]
    Uxtl2       v30.8h, v30.16b     //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    SADDW       v30.8h, v30.8h , v27.8b             //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    ST1         {v21.8b},[x12],#8   //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ST1         {v23.8b},[x12],x1
    SUB         x12,x12,#8

    SMAX        v30.8h, v30.8h , v4.8h              //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUBS        x4,x4,#1            //Decrement row by 1
    UMIN        v30.8h, v30.8h , v6.8h              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v28.8b, v28.8h      //II vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn         v29.8b, v30.8h      //II vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         {v28.8b, v29.8b},[x12],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP        //If not equal jump to the inner loop

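    //Each pass of PU1_SRC_LOOP above filters two rows of the current 16-byte
    //strip (8 U/V pairs). Once all ht rows are done, the code below advances
    //pu1_src to the next strip, reloads pu1_src_left and either repeats,
    //falls through to the 8-byte residue path, or exits; roughly
    //(illustrative only):
    //
    //    for(col = wd; col > 8; col -= 16)
    //        ;   /* WIDTH_LOOP_16 : full 16-byte strips          */
    //    if(wd & 0xF)
    //        ;   /* WIDTH_RESIDUE : remaining 8 bytes, all rows  */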
    ADD         x0,x0,#16           //pu1_src += 16

    SUBS        x8,x8,#16           //Decrement column by 16
    CMP         x8,#8               //Check whether residue remains
    MOV         x2,x3               //Reload pu1_src_left
    BEQ         WIDTH_RESIDUE       //If residue remains, jump to the residue loop
    BGT         WIDTH_LOOP_16       //If not equal, jump to width_loop
    BLT         END_LOOPS           //Jump to end of function

WIDTH_RESIDUE:
    SUB         x6,x6,#14
    AND         x8,x9,#0xF          //wd_rem = wd & 0xF
    CMP         x8,#0               //Residue check
    BEQ         END_LOOPS           //No residue, jump to end of function

    CMP         x8,x9               //if(wd_rem == wd)
    BNE         AU1_MASK_FF_RESIDUE //jump to else part
    LDRB        w12,[x7]            //pu1_avail[0]
    mov         v3.8b[0], w12       //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    mov         v3.8b[1], w12       //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF_RESIDUE            //Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         x12,#-1             //move -1 to x12
    mov         v3.4h[0], w12       //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        w12,[x7,#1]         //pu1_avail[1]
    mov         v3.8b[6], w12       //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    mov         v3.8b[7], w12       //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

    MOV         x12,x0              //pu1_src_cpy = pu1_src
    MOV         x4,x10              //move ht to x4 for loop count

PU1_SRC_LOOP_RESIDUE:
    LDRH        w11,[x2]            //load pu1_src_left
    LD1         {v19.16b},[x12],x1  //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //LD1       {v13.8b},[x12],x1   //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //SUB       x12, x12,#8
    SUB         x5,x9,#2            //wd - 2

    SUB         x14,x10,x4          //(ht - row)
    mov         v21.4h[7], w11      //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    LSL         x14,x14,#1          //(ht - row) * 2

    LD1         {v30.16b},[x12]     //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //LD1       {v31.8b},[x12]      //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    //SUB       x12, x12,#8
    EXT         v21.16b, v21.16b , v19.16b,#14      //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    SUB         x12,x12,x1

    LDRH        w11,[x2,#2]         //II load pu1_src_left
    cmhi        v16.16b, v19.16b , v21.16b          //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    mul         x14, x14, x1        //(ht - row) * 2 * src_strd

    cmhi        v18.16b, v21.16b , v19.16b          //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    mov         v28.4h[7], w11      //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)

    LDRB        w11,[x12,#16]       //pu1_src_cpy[16]
    SUB         v20.16b, v18.16b , v16.16b          //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         x5,x14,x5           //(ht - row) * 2 * src_strd + (wd - 2)

    mov         v21.8b[0], w11      //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    EXT         v28.16b, v28.16b , v30.16b,#14      //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)

    LDRB        w11,[x12,#17]       //pu1_src_cpy[17]
    cmhi        v26.16b, v30.16b , v28.16b          //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    LDRH        w14,[x6, x5]        //pu1_src_org[(ht - row) * 2 * src_strd + (wd - 2)]

    mov         v21.8b[1], w11      //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    cmhi        v24.16b, v28.16b , v30.16b          //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         x12,x12,x1

    STRH        w14,[x2],#2         //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    EXT         v21.16b, v19.16b , v21.16b,#2       //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    LDRB        w11,[x12,#16]       //II pu1_src_cpy[16]

    cmhi        v16.16b, v19.16b , v21.16b          //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    mov         v28.8b[0], w11      //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)

    LDRB        w11,[x12,#17]       //II pu1_src_cpy[17]
    cmhi        v18.16b, v21.16b , v19.16b          //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    SUB         x4,x4,#1            //II Decrement row by 1

    SUB         v22.16b, v18.16b , v16.16b          //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    mov         v28.8b[1], w11      //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    SUB         x12,x12,x1

    ADD         v21.16b, v2.16b , v20.16b           //edge_idx = vaddq_s8(const_2, sign_left)
    EXT         v28.16b, v30.16b , v28.16b,#2       //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)

    ADD         v21.16b, v21.16b , v22.16b          //edge_idx = vaddq_s8(edge_idx, sign_right)

    SUB         v20.16b, v24.16b , v26.16b          //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    TBL         v21.16b, {v5.16b},v21.16b           //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    cmhi        v26.16b, v30.16b , v28.16b          //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    cmhi        v24.16b, v28.16b , v30.16b          //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    //TBL       v15.8b, {v10.16b},v15.8b            //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    SUB         v22.16b, v24.16b , v26.16b          //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    AND         v21.16b, v21.16b , v3.16b           //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v23.d[0],v21.d[1]
    UZP1        v1.8b, v21.8b, v23.8b
    UZP2        v23.8b, v21.8b, v23.8b
    mov         v21.8b, v1.8b

    ADD         v28.16b, v2.16b , v20.16b           //II edge_idx = vaddq_s8(const_2, sign_left)
    TBL         v16.8b, {v7.16b},v21.8b             //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    ADD         v28.16b, v28.16b , v22.16b          //II edge_idx = vaddq_s8(edge_idx, sign_right)

    Uxtl        v18.8h, v19.8b      //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL         v17.8b, {v0.16b},v23.8b
    Uxtl        v24.8h, v30.8b      //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    ZIP1        v1.8b, v16.8b, v17.8b
    ZIP2        v17.8b, v16.8b, v17.8b
    mov         v16.8b, v1.8b
    TBL         v28.16b, {v5.16b},v28.16b           //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SADDW       v18.8h, v18.8h , v16.8b             //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v18.8h, v18.8h , v4.8h              //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    //TBL       v29.8b, {v10.16b},v29.8b            //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    UMIN        v18.8h, v18.8h , v6.8h              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v18.8b, v18.8h      //vmovn_s16(pi2_tmp_cur_row.val[0])
    AND         v28.16b, v28.16b , v3.16b           //II edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v29.d[0],v28.d[1]
    SUB         x5,x9,#2            //II wd - 2
    UZP1        v1.8b, v28.8b, v29.8b
    UZP2        v29.8b, v28.8b, v29.8b              //II
    mov         v28.8b, v1.8b
    SUB         x14,x10,x4          //II (ht - row)

    LSL         x14,x14,#1          //II (ht - row) * 2
    TBL         v26.8b, {v7.16b},v28.8b             //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    mul         x14, x14, x1        //II (ht - row) * 2 * src_strd

    ADD         x5,x14,x5           //II (ht - row) * 2 * src_strd + (wd - 2)
    TBL         v27.8b, {v0.16b},v29.8b             //II
    LDRH        w14,[x6, x5]        //II pu1_src_org[(ht - row) * 2 * src_strd + (wd - 2)]

    ZIP1        v1.8b, v26.8b, v27.8b
    ZIP2        v27.8b, v26.8b, v27.8b              //II
    mov         v26.8b, v1.8b
    ST1         {v18.8b},[x12],x1   //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    STRH        w14,[x2],#2         //II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    SADDW       v24.8h, v24.8h , v26.8b             //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SUBS        x4,x4,#1            //Decrement row by 1

    SMAX        v24.8h, v24.8h , v4.8h              //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v24.8h, v24.8h , v6.8h              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v28.8b, v24.8h      //II vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v28.8b},[x12],x1   //II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE                //If not equal jump to the pu1_src loop

END_LOOPS:
    // LDMFD sp!,{x4-x12,x15}       //Reload the registers from SP
    ldp         x25, x26,[sp],#16
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret
