///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class3.s
//*
//* @brief
//*  Contains function definitions for SAO (sample adaptive offset) edge
//*  offset, class 3 (135 degree diagonal). Functions are coded using NEON
//*  intrinsics and can be compiled using ARM RVCT.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
//                                  WORD32 src_strd,
//                                  UWORD8 *pu1_src_left,
//                                  UWORD8 *pu1_src_top,
//                                  UWORD8 *pu1_src_top_left,
//                                  UWORD8 *pu1_src_top_right,
//                                  UWORD8 *pu1_src_bot_left,
//                                  UWORD8 *pu1_avail,
//                                  WORD8 *pi1_sao_offset,
//                                  WORD32 wd,
//                                  WORD32 ht)
//**************Variables Vs Registers*****************************************
//x0       => *pu1_src
//x1       => src_strd
//x2       => *pu1_src_left
//x3       => *pu1_src_top
//x4       => *pu1_src_top_left
//x5       => *pu1_src_top_right
//x6       => *pu1_src_bot_left
//x7       => *pu1_avail
//[sp,#0]  => *pi1_sao_offset (at entry)
//[sp,#8]  => wd              (at entry)
//[sp,#16] => ht              (at entry)
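
//**************Reference (scalar) sketch**************************************
//The loops below vectorize the per-pixel SAO edge offset for class 3, whose
//neighbours lie on the 135 degree diagonal (top-right and bottom-left). A
//minimal scalar sketch of the kernel, assuming 8-bit depth (variable names
//here are illustrative, not taken from this file):
//
//    WORD32 sign_up   = SIGN(pu1_src[x] - pu1_src[x - src_strd + 1]);
//    WORD32 sign_down = SIGN(pu1_src[x] - pu1_src[x + src_strd - 1]);
//    WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
//    if(0 != edge_idx)
//        pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset[edge_idx], 0, 255);
//
//The NEON code evaluates this for 16 pixels at a time, keeping sign_up and
//sign_down as vectors of -1/0/1 and mapping them through edge_idx_tbl and
//offset_tbl with TBL.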
.text
.p2align 2

.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_av8

ihevc_sao_edge_offset_class3_av8:


    // STMFD sp!,{x4-x12,x14}              //stack stores the values of the arguments
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!

    MOV         x19,x0                      //Store pu1_src in x19
    MOV         x21,x6                      //Store pu1_src_bot_left in x21 (x21 is re-loaded with pu1_src_left at PU1_AVAIL_3_LOOP)
    MOV         x22,x3                      //Store pu1_src_top in x22
    MOV         x23,x7                      //Store pu1_avail in x23
    MOV         x24,x4                      //Store pu1_src_top_left in x24
    MOV         x20,x5                      //Store pu1_src_top_right in x20
    MOV         x13,x6                      //Store pu1_src_bot_left in x13

    MOV         x5,x7                       //Loads pu1_avail

    LDR         x6,[sp,#48]                 //Loads pi1_sao_offset
    LDR         w7,[sp,#56]                 //Loads wd
    LDR         w8,[sp,#64]                 //Loads ht

    MOV         x16,x7                      //wd
    MOV         x17,x8                      //ht

    SUB         x9,x7,#1                    //wd - 1

    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]

    MOV         x9,x7                       //Move width to x9 for loop count

    SUB         sp,sp,#0xA0                 //Decrement the stack pointer to store some temp arr values

    STRB        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         x10,x8,#1                   //ht-1
    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    ADD         x12,sp,#0x02                //temp array

AU1_SRC_TOP_LOOP:
    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    SUBS        x9,x9,#8                    //Decrement the loop count by 8
    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP:
    LDRB        w9,[x5,#5]                  //pu1_avail[5]
    CMP         x9,#0
    SUB         x10,x7,#1                   //[wd - 1]
    LDRB        w9,[x0,x10]                 //u1_pos_0_0_tmp = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP

    MOV         x11,x20                     //Load pu1_src_top_right from x20
    SUB         x10,x10,#1                  //[wd - 1 - 1]

    LDRB        w11,[x11]                   //pu1_src_top_right[0]
    SUB         x12,x9,x11                  //pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD         x11,x0,x1                   //pu1_src + src_strd

    LDRB        w14,[x11,x10]               //pu1_src[wd - 1 - 1 + src_strd]
    CMP         x12,#0
    movn        x20,#0
    csel        x12, x20, x12,LT
    SUB         x11,x9,x14                  //pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOV         x20,#1
    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD         x11,x11,#2                  //edge_idx

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0                      //0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP
    LDRSB       x10,[x6,x12]                //pi1_sao_offset[edge_idx]
    ADD         x9,x9,x10                   //pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
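
//The corner pixels filtered here (u1_pos_0_0_tmp above, u1_pos_wd_ht_tmp
//below) are written back only at RE_ASSINING_LOOP, and the unfiltered last
//row / last column were saved to stack temporaries (au1_src_top_tmp,
//au1_src_left_tmp) before the in-place stores overwrite them, so pu1_src_top
//and pu1_src_left can be updated with unfiltered samples for the
//neighbouring blocks' SAO.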
PU1_AVAIL_6_LOOP:
    LDRB        w10,[x5,#6]                 //pu1_avail[6]
    SUB         x11,x8,#1                   //ht - 1

    CMP         x10,#0
    madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]

    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP

    MOV         x14,x13                     //Load pu1_src_bot_left from x13
    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
    ADD         x11,x11,#1                  //pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP         x14,#0
    movn        x20,#0
    csel        x14, x20, x14,LT
    MOV         x20,#1
    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         x11,x11,x14                 //Add the 2 sign values

    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    ADD         x11,x11,#2                  //edge_idx

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDRSB       x11,[x6,x12]                //pi1_sao_offset[edge_idx]
    ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    MOV         x21,x2                      //Store pu1_src_left in x21
    MOV         x12,x8                      //Move ht

    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    LDRB        w11,[x5,#3]                 //pu1_avail[3]

    CMP         x11,#0
    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    SUB         x20,x12,#1                  //ht_tmp--
    csel        x12, x20, x12,EQ

    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         x5,#0

    ADD         x20,x0,x1                   //pu1_src += src_strd
    csel        x0, x20, x0,EQ
    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
    SUB         x20,x12,#1                  //ht_tmp--
    csel        x12, x20, x12,EQ

    ADRP        x6, :got:gi1_table_edge_idx //table pointer
    LDR         x6, [x6, #:got_lo12:gi1_table_edge_idx]

    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    ADD         x20,x14,#1                  //pu1_src_left_cpy += 1
    csel        x14, x20, x14,EQ

    MOV         x15,x0                      //Store pu1_src in x15
    LD1         {v6.8b},[x6]                //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         x6,x7                       //Move wd to x6 for the loop count

    CMP         x7,#16                      //Compare wd with 16
    BLT         WIDTH_RESIDUE               //If wd < 16 jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP         x8,#4                       //Compare ht with 4
    BLE         WD_16_HT_4_LOOP             //If ht <= 4 jump to WD_16_HT_4_LOOP
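
//**************Main 16-wide loop**********************************************
//WIDTH_LOOP_16 processes 16 columns per pass and software-pipelines three
//rows (the //I, //II, //III tags below). For class 3 the bottom-left
//relation of row N is the negated top-right relation of row N+1, i.e.
//sign_up(N+1)[x] = -sign_down(N)[x+1], so each row turns sign_down into the
//next row's sign_up with a NEG plus a one-byte EXT; only lane 15, whose true
//neighbour lies outside the 16-byte window, is patched from scalar code
//(SIGN_UP_CHANGE).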
WIDTH_LOOP_16:
    MOV         x7,x16                      //Loads wd

    MOV         x5,x23                      //Loads pu1_avail
    CMP         x6,x7                       //col == wd
    LDRb        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ
    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         x6,#16                      //if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    MOV         x4,x17                      //Loads ht
    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ

    csel        x8, x3, x8,NE
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp

    MOV         x7,x16                      //Loads wd
    ADD         x8,x8,#1                    //pu1_src - src_strd + 1

    SUB         x7,x7,x6                    //(wd - col)
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    ADD         x3,x3,#16

    MOV         x8,x19                      //Loads *pu1_src
    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    ADD         x7,x7,#15                   //15 + (wd - col)

    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    cmhi        v17.16b, v5.16b , v3.16b    //vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         x5,x5,#1

AU1_SRC_LEFT_LOOP:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    SUBS        x4,x4,#1                    //decrement the loop count
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP

    movi        v18.16b, #0
    cmhi        v16.16b, v3.16b , v5.16b    //vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD         x8,x0,x1                    //I *pu1_src + src_strd
    SUB         v17.16b, v16.16b , v17.16b  //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x7,x12                      //row count, move ht_tmp to x7

    SUB         x5,x12,x7                   //I ht_tmp - row
    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x8,x14,x5                   //I pu1_src_left_cpy[ht_tmp - row]

    ADD         x8,x8,#1                    //I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        w8,[x8]

    MOV         x5,x23                      //I Loads pu1_avail
    mov         v18.b[15], w8               //I vsetq_lane_u8
    LDRB        w5,[x5,#2]                  //I pu1_avail[2]

    EXT         v18.16b, v18.16b , v16.16b,#15 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP         x5,#0                       //I
    BNE         SIGN_UP_CHANGE_DONE         //I

SIGN_UP_CHANGE:
    LDRB        w8,[x0,#15]                 //I pu1_src_cpy[15]
    SUB         x5,x0,x1                    //I pu1_src_cpy[16 - src_strd]

    LDRB        w5,[x5,#16]                 //I load the value
    SUB         x8,x8,x5                    //I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         x8,#0                       //I
    movn        x20,#0
    csel        x8, x20, x8,LT              //I
    MOV         x20,#1
    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    mov         v17.b[15], w8               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    cmhi        v3.16b, v5.16b , v18.16b    //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v18.16b, v18.16b , v5.16b   //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v3.16b, v18.16b , v3.16b    //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v18.16b, v0.16b , v17.16b   //I edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v18.16b, v18.16b , v3.16b   //I edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v18.16b, {v6.16b},v18.16b   //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG         v17.16b, v3.16b             //I sign_up = vnegq_s8(sign_down)

    EXT         v17.16b, v17.16b , v17.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1)
    // TBL      v19.8b, {v6.16b},v19.8b     //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND         v18.16b, v18.16b , v1.16b   //I edge_idx = vandq_s8(edge_idx, au1_mask)

    TBL         v3.16b, {v7.16b},v18.16b    //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    Uxtl2       v22.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW       v20.8h, v20.8h , v3.8b      //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v20.8h, v20.8h , v2.8h      //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    // TBL      v11.8b, {v7.16b},v19.8b     //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    UMIN        v20.8h, v20.8h , v4.8h      //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov         v5.16b, v16.16b
    SADDW2      v22.8h, v22.8h , v3.16b     //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX        v22.8h, v22.8h , v2.8h      //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v22.8h, v22.8h , v4.8h      //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1

PU1_SRC_LOOP:
    ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
    xtn         v20.8b, v20.8h              //I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         x5,x12,x7                   //II ht_tmp - row

    ADD         x4,x0,x1                    //II pu1_src_cpy[16 - src_strd]
    xtn2        v20.16b, v22.8h             //I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         x2,x8,x1                    //III *pu1_src + src_strd

    LDRB        w11,[x4,#15]                //II pu1_src_cpy[15]
    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1

    ADD         x8,x14,x5                   //II pu1_src_left_cpy[ht_tmp - row]
    LD1         {v30.16b},[x2]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    LDRB        w8,[x8,#1]

    LDRB        w4,[x0,#16]                 //II load the value
    mov         v18.b[15], w8               //II vsetq_lane_u8
    SUB         x11,x11,x4                  //II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         x11,#0                      //II
    ST1         {v20.16b},[x0],x1           //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         x5,x12,x7                   //III ht_tmp - row

    movn        x20,#0
    csel        x11, x20, x11,LT            //II
    EXT         v18.16b, v18.16b , v16.16b,#15 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOV         x20,#1
    csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD         x8,x14,x5                   //III pu1_src_left_cpy[ht_tmp - row]
    mov         v17.b[15], w11              //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP         x7,#1                       //III

    BNE         NEXT_ROW_ELSE_2             //III
    MOV         x5,x23                      //III Loads pu1_avail
    LDRB        w5,[x5,#3]                  //III pu1_avail[3]
    CMP         x5,#0                       //III
    SUB         x20,x2,#2                   //III pu1_src_cpy[src_strd - 1]
    csel        x8, x20, x8,NE

NEXT_ROW_ELSE_2:
    LDRB        w8,[x8,#1]                  //III
    cmhi        v24.16b, v5.16b , v18.16b   //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         x5,x0,x1

    LDRB        w2,[x5,#15]                 //III pu1_src_cpy[15]
    cmhi        v26.16b, v18.16b , v5.16b   //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB        w5,[x0,#16]                 //III load the value

    SUB         x2,x2,x5                    //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    SUB         v24.16b, v26.16b , v24.16b  //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP         x2,#0                       //III

    movn        x20,#0
    csel        x2, x20, x2,LT              //III
    mov         v18.b[15], w8               //III vsetq_lane_u8
    MOV         x20,#1
    csel        x2, x20, x2,GT              //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    ADD         v26.16b, v0.16b , v17.16b   //II edge_idx = vaddq_s8(const_2, sign_up)

    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
    EXT         v18.16b, v18.16b , v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    ADD         v26.16b, v26.16b , v24.16b  //II edge_idx = vaddq_s8(edge_idx, sign_down)

    EXT         v17.16b, v17.16b , v17.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1)
    TBL         v26.16b, {v6.16b},v26.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    cmhi        v3.16b, v16.16b , v18.16b   //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    mov         v17.b[15], w2               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    // TBL      v27.8b, {v6.16b},v27.8b     //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    cmhi        v18.16b, v18.16b , v16.16b  //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND         v26.16b, v26.16b , v1.16b   //II edge_idx = vandq_s8(edge_idx, au1_mask)

    SUB         v3.16b, v18.16b , v3.16b    //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    TBL         v24.16b, {v7.16b},v26.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    ADD         v18.16b, v0.16b , v17.16b   //III edge_idx = vaddq_s8(const_2, sign_up)

    ADD         v18.16b, v18.16b , v3.16b   //III edge_idx = vaddq_s8(edge_idx, sign_down)
    // TBL      v25.8b, {v7.16b},v27.8b     //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    NEG         v17.16b, v3.16b             //III sign_up = vnegq_s8(sign_down)

    SADDW       v28.8h, v28.8h , v24.8b     //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    TBL         v18.16b, {v6.16b},v18.16b   //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SMAX        v28.8h, v28.8h , v2.8h      //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    EXT         v17.16b, v17.16b , v17.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1)
    // TBL      v19.8b, {v6.16b},v19.8b     //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    UMIN        v28.8h, v28.8h , v4.8h      //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    AND         v18.16b, v18.16b , v1.16b   //III edge_idx = vandq_s8(edge_idx, au1_mask)

    SADDW2      v26.8h, v26.8h , v24.16b    //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    TBL         v3.16b, {v7.16b},v18.16b    //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    SMAX        v26.8h, v26.8h , v2.8h      //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UMIN        v26.8h, v26.8h , v4.8h      //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SADDW       v20.8h, v20.8h , v3.8b      //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    // TBL      v11.8b, {v7.16b},v19.8b     //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SMAX        v20.8h, v20.8h , v2.8h      //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    Uxtl2       v22.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    UMIN        v20.8h, v20.8h , v4.8h      //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v28.8b, v28.8h              //II vmovn_s16(pi2_tmp_cur_row.val[0])
    SADDW2      v22.8h, v22.8h , v3.16b     //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    xtn2        v28.16b, v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[1])
    SMAX        v22.8h, v22.8h , v2.8h      //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    mov         v5.16b, v30.16b             //II pu1_cur_row = pu1_next_row
    UMIN        v22.8h, v22.8h , v4.8h      //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP         x7,#1                       //III
    ST1         {v28.16b},[x0],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
    xtn         v20.8b, v20.8h              //III vmovn_s16(pi2_tmp_cur_row.val[0])
    MOV         x5,x23                      //Loads pu1_avail

    LDRB        w5,[x5,#3]                  //pu1_avail[3]
    xtn2        v20.16b, v22.8h             //III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         x5,#0

    ADD         x4,x0,x1                    //pu1_src_cpy[16 - src_strd]
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    LDRB        w5,[x0,#16]                 //load the value

    BEQ         NEXT_ROW_ELSE_3
    SUB         x8,x8,#1
    LDRB        w8,[x8]                     //pu1_src_cpy[src_strd - 1]
    B           NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB         x11,x12,x7                  //ht_tmp - row
    ADD         x8,x14,x11                  //pu1_src_left_cpy[ht_tmp - row]
    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        w8,[x8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB        w11,[x4,#15]                //pu1_src_cpy[15]
    mov         v18.b[15], w8               //vsetq_lane_u8
    SUB         x8,x11,x5                   //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         x8,#0
    EXT         v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    movn        x20,#0
    csel        x8, x20, x8,LT

    ST1         {v20.16b},[x0],x1           //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    cmhi        v24.16b, v5.16b , v18.16b   //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    cmhi        v26.16b, v18.16b , v5.16b   //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    mov         v17.b[15], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    SUB         v24.16b, v26.16b , v24.16b  //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    ADD         v26.16b, v0.16b , v17.16b   //edge_idx = vaddq_s8(const_2, sign_up)

    Uxtl2       v22.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    ADD         v26.16b, v26.16b , v24.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)

    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    // TBL      v27.8b, {v6.16b},v27.8b     //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b, v26.16b , v1.16b   //edge_idx = vandq_s8(edge_idx, au1_mask)

    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    SADDW       v20.8h, v20.8h , v24.8b     //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    // TBL      v25.8b, {v7.16b},v27.8b     //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SMAX        v20.8h, v20.8h , v2.8h      //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN        v20.8h, v20.8h , v4.8h      //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW2      v22.8h, v22.8h , v24.16b    //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v22.8h, v22.8h , v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v22.8h, v22.8h , v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    xtn         v20.8b, v20.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])
    MOV         x8,x17                      //Loads ht

    xtn2        v20.16b, v22.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp

    ST1         {v20.16b},[x0],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    MOV         x2,x21                      //Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
    SUBS        x8,x8,#4
    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    CMP         x6,#8                       //Check whether residue remains
    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    MOV         x7,x16                      //Loads wd
    MOV         x0,x15                      //Loads *pu1_src
    SUB         x7,x7,x6
    ADD         x0,x0,x7
    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop

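
//**************16-wide, short-height path*************************************
//WD_16_HT_4_LOOP handles 16-column strips when ht <= 4: the same per-pixel
//math as WIDTH_LOOP_16, but one row per iteration, without the I/II/III row
//pipelining used above.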
WD_16_HT_4_LOOP:
    MOV         x5,x23                      //Loads pu1_avail
    MOV         x7,x16                      //Loads wd
    CMP         x6,x7                       //col == wd
    LDRb        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ
    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         x6,#16                      //if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE
    ADD         x8,x8,#1                    //pu1_src - src_strd + 1
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)

    ADD         x3,x3,#16
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    MOV         x4,x17                      //Loads ht
    MOV         x7,x16                      //Loads wd
    SUB         x7,x7,x6                    //(wd - col)
    ADD         x7,x7,#15                   //15 + (wd - col)
    MOV         x8,x19                      //Loads *pu1_src
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB         x5,x5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    SUBS        x4,x4,#1                    //decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)

    cmhi        v17.16b, v5.16b , v3.16b    //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v16.16b, v3.16b , v5.16b    //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v17.16b, v16.16b , v17.16b  //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    movi        v18.16b, #0
    MOV         x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#3]                  //pu1_avail[3]
    CMP         x5,#0
    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    CMP         x7,#1
    SUB         x8,x8,#1
    LDRb        w20, [x8]                   //pu1_src_cpy[src_strd - 1]
    csel        w8,w20,w8,EQ
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB         x5,x12,x7                   //ht_tmp - row
    ADD         x8,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        w8,[x8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    mov         v18.b[15], w8               //vsetq_lane_u8
    EXT         v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         x7,x12
    BNE         SIGN_UP_CHANGE_WD_16_HT_4
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        w8,[x0,#15]                 //pu1_src_cpy[15]
    ADD         x5,x0,#16                   //pu1_src_cpy[16]
    SUB         x5,x5,x1                    //pu1_src_cpy[16 - src_strd]
    LDRB        w5,[x5]                     //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    mov         v17.b[15], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    cmhi        v20.16b, v5.16b , v18.16b   //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v22.16b, v18.16b , v5.16b   //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b, v22.16b , v20.16b  //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b, v0.16b , v17.16b   //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b, v26.16b , v24.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)

    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    // TBL      v27.8b, {v6.16b},v27.8b     //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b, v26.16b , v1.16b   //edge_idx = vandq_s8(edge_idx, au1_mask)

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)

    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h, v28.8h , v24.8b     //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h, v28.8h , v2.8h      //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h, v28.8h , v4.8h      //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    // TBL      v25.8b, {v7.16b},v27.8b     //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    Uxtl2       v30.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW2      v30.8h, v30.8h , v24.16b    //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v30.8h, v30.8h , v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v30.8h, v30.8h , v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v28.8b, v28.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2        v28.16b, v30.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         {v28.16b},[x0],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    MOV         x8,x17                      //Loads ht
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    MOV         x2,x21                      //Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        x8,x8,#4
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop

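
//**************Residue path***************************************************
//WIDTH_RESIDUE filters the final (or, when wd < 16, the only) 8-pixel
//column: au1_mask is terminated at lane 7 instead of lane 15, and only the
//low 8 filtered bytes are stored per row.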
WIDTH_RESIDUE:
    MOV         x7,x16                      //Loads wd
    MOV         x5,x23                      //Loads pu1_avail
    CMP         x6,x7                       //wd_residue == wd
    LDRb        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ

    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

PU1_AVAIL_2_RESIDUE:
    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE
    ADD         x8,x8,#1                    //pu1_src - src_strd + 1
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)


    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    MOV         x4,x17                      //Loads ht
    MOV         x7,x16                      //Loads wd
    MOV         x8,x19                      //Loads *pu1_src
    SUB         x7,x7,#1                    //(wd - 1)
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 1)]
    SUB         x5,x5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    SUBS        x4,x4,#1                    //decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)

    cmhi        v17.16b, v5.16b , v3.16b    //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v16.16b, v3.16b , v5.16b    //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v17.16b, v16.16b , v17.16b  //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    movi        v18.16b, #0
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#3]                  //pu1_avail[3]
    CMP         x5,#0
    BEQ         NEXT_ROW_ELSE_RESIDUE
    CMP         x7,#1
    SUB         x8,x8,#1
    LDRb        w20, [x8]                   //pu1_src_cpy[src_strd - 1]
    csel        w8,w20,w8,EQ
    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB         x5,x12,x7                   //ht_tmp - row
    ADD         x8,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
    ADD         x8,x8,#1                    //pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        w8,[x8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    mov         v18.b[15], w8               //vsetq_lane_u8
    EXT         v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         x7,x12
    BNE         SIGN_UP_CHANGE_RESIDUE
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        w8,[x0,#15]                 //pu1_src_cpy[15]
    ADD         x5,x0,#16                   //pu1_src_cpy[16]
    SUB         x5,x5,x1                    //pu1_src_cpy[16 - src_strd]
    LDRB        w5,[x5]                     //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    mov         v17.b[15], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    cmhi        v20.16b, v5.16b , v18.16b   //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v22.16b, v18.16b , v5.16b   //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b, v22.16b , v20.16b  //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b, v0.16b , v17.16b   //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b, v26.16b , v24.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    // TBL      v27.8b, {v6.16b},v27.8b     //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b, v26.16b , v1.16b   //edge_idx = vandq_s8(edge_idx, au1_mask)

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b, v17.16b , v17.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)

    TBL         v24.8b, {v7.16b},v26.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h, v28.8h , v24.8b     //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h, v28.8h , v2.8h      //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h, v28.8h , v4.8h      //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v30.8b, v28.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v30.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    MOV         x8,x17                      //Loads ht
    MOV         x2,x21                      //Loads *pu1_src_left
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         w7,[x5],#4                  //au1_src_left_tmp[row]
    SUBS        x8,x8,#4
    STR         w7,[x2],#4                  //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE

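
//**************Write back deferred results************************************
//RE_ASSINING_LOOP restores everything whose write-back had to wait until
//filtering finished: the two corner pixels computed at PU1_AVAIL_5_LOOP /
//PU1_AVAIL_6_LOOP, *pu1_src_top_left, and the saved (unfiltered) bottom row
//into pu1_src_top for the block below.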
RE_ASSINING_LOOP:
    MOV         x7,x16                      //Loads wd
    MOV         x0,x19                      //Loads *pu1_src

    MOV         x11,x17                     //Loads ht
    ADD         x8,x0,x7                    //pu1_src[wd]

    MOV         x4,x24                      //Loads pu1_src_top_left
    SUB         x11,x11,#1                  //ht - 1

    SUB         x8,x8,#1
    STRB        w9,[x8]                     //pu1_src_org[wd - 1] = u1_pos_0_0_tmp
    ADD         x8,x8,#1
    madd        x6, x11, x1, x0             //pu1_src_org[(ht - 1) * src_strd]

    LDRB        w8,[sp]                     //load u1_src_top_left_tmp from stack pointer
    ADD         x12,sp,#0x02

    STRB        w10,[x6]                    //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB        w8,[x4]                     //*pu1_src_top_left = u1_src_top_left_tmp
    MOV         x3,x22                      //Loads pu1_src_top

SRC_TOP_LOOP:
    LD1         {v0.8b},[x12],#8            //au1_src_top_tmp[col]
    SUBS        x7,x7,#8                    //Decrement the width
    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xA0
    // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
    ldp         x23, x24,[sp], #16
    ldp         x21, x22,[sp], #16
    ldp         x19, x20,[sp], #16
    ret
