@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3.s
@*
@* ,:brief
@*  Contains function definitions for sample adaptive offset (SAO) edge offset,
@*  class 3. Functions are coded in ARM NEON assembly and can be compiled using
@*  ARM RVCT.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  UWORD8 *pu1_src_top_right,
@                                  UWORD8 *pu1_src_bot_left,
@                                  UWORD8 *pu1_avail,
@                                  WORD8 *pi1_sao_offset,
@                                  WORD32 wd,
@                                  WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 => *pu1_src
@r1 => src_strd
@r2 => *pu1_src_left
@r3 => *pu1_src_top
@r4 => *pu1_src_top_left
@r5 => *pu1_avail
@r6 => *pi1_sao_offset
@r7 => wd
@r8 => ht
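@
@ The code below vectorises the per-pixel class 3 (45-degree diagonal) edge
@ offset: each pixel is compared against its top-right and bottom-left
@ neighbours, the two signs select an entry of gi1_table_edge_idx, and the
@ corresponding pi1_sao_offset value is added with clipping. As a minimal
@ scalar sketch of that inner operation (an illustration only; the pu1_avail
@ boundary handling and the corner-pixel special cases are omitted, and
@ SIGN()/CLIP3() are assumed helper macros returning -1/0/+1 and a clamped
@ value respectively):
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           WORD32 edge_idx = 2
@               + SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd])
@               + SIGN(pu1_src[col] - pu1_src[col - 1 + src_strd]);
@           edge_idx = gi1_table_edge_idx[edge_idx];
@           if(0 != edge_idx)
@               pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                                    0, (1 << bit_depth) - 1);
@       }
@       pu1_src += src_strd;
@   }
@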

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class3_a9q:


    STMFD       sp!,{r4-r12,r14}        @stack stores the values of the arguments
    LDR         r7,[sp,#0x3C]           @Loads wd

    LDR         r8,[sp,#0x40]           @Loads ht
    SUB         r9,r7,#1                @wd - 1

    LDR         r4,[sp,#0x28]           @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]             @pu1_src_top[wd - 1]

    MOV         r9,r7                   @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]           @Loads pu1_avail
    LDR         r6,[sp,#0x38]           @Loads pi1_sao_offset
    STR         r3,[sp,#0x38]           @Store pu1_src_top in sp

    SUB         sp,sp,#0x94             @Decrement the stack pointer to store some temp arr values

    STRB        r10,[sp]                @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1               @ht-1
    MLA         r11,r10,r1,r0           @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#0x02            @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!               @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                @Decrement the loop count by 8
    VST1.8      D0,[r12]!               @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP:
    LDRB        r9,[r5,#5]              @pu1_avail[5]
    CMP         r9,#0
    SUB         r10,r7,#1               @[wd - 1]
    LDRB        r9,[r0,r10]             @u1_pos_0_0_tmp = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP

    LDR         r11,[sp,#0xC0]          @Load pu1_src_top_right from sp
    SUB         r10,r10,#1              @[wd - 1 - 1]

    LDRB        r11,[r11]               @pu1_src_top_right[0]
    SUB         r12,r9,r11              @pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD         r11,r0,r1               @pu1_src + src_strd

    LDRB        r14,[r11,r10]           @pu1_src[wd - 1 - 1 + src_strd]
    CMP         r12,#0
    MVNLT       r12,#0
    SUB         r11,r9,r14              @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOVGT       r12,#1                  @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                  @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    ADD         r11,r12,r11             @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD         r11,r11,#2              @edge_idx

    LDRSB       r12,[r14,r11]           @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                  @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP
    LDRSB       r10,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10               @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP:
    LDRB        r10,[r5,#6]             @pu1_avail[6]
    SUB         r11,r8,#1               @ht - 1

    CMP         r10,#0
    STR         r0,[sp,#0xC0]           @Store pu1_src in sp
    MLA         r12,r11,r1,r0           @pu1_src[(ht - 1) * src_strd]

    LDRB        r10,[r12]               @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP

    LDR         r14,[sp,#0xC4]          @Load pu1_src_bot_left from sp
    SUB         r11,r12,r1              @pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB        r14,[r14]               @Load pu1_src_bot_left[0]
    ADD         r11,r11,#1              @pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB        r11,[r11]               @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB         r14,r10,r14             @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB         r11,r10,r11             @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14             @Add 2 sign value

    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc
    ADD         r11,r11,#2              @edge_idx

    LDRSB       r12,[r14,r11]           @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDRSB       r11,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11             @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10              @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STR         r2,[sp,#0xC4]           @Store pu1_src_left in sp
    MOV         r12,r8                  @Move ht

    MOV         r14,r2                  @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I8     Q0,#2                   @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]             @pu1_avail[3]

    CMP         r11,#0
    VMOV.I16    Q1,#0                   @const_min_clip = vdupq_n_s16(0)
    SUBEQ       r12,r12,#1              @ht_tmp--

    LDRB        r5,[r5,#2]              @pu1_avail[2]
    VMOV.I16    Q2,#255                 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         r5,#0

    ADDEQ       r0,r0,r1                @pu1_src += src_strd
    VLD1.8      D7,[r6]                 @offset_tbl = vld1_s8(pi1_sao_offset)
    SUBEQ       r12,r12,#1              @ht_tmp--

    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r6,r6,pc
    VMOV.S8     Q4,#0xFF                @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1              @pu1_src_left_cpy += 1

    STR         r0,[sp,#0x90]           @Store pu1_src in sp
    VLD1.8      D6,[r6]                 @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r6,r7                   @move wd to r6 loop_count

    CMP         r7,#16                  @Compare wd with 16
    BLT         WIDTH_RESIDUE           @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP         r8,#4                   @Compare ht with 4
    BLE         WD_16_HT_4_LOOP         @If jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR         r7,[sp,#0xD0]           @Loads wd

    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    CMP         r6,r7                   @col == wd
    LDRBEQ      r8,[r5]                 @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                  @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]              @pu1_avail[1]
    VMOV.8      d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r8,[r5,#2]              @pu1_avail[2]
    CMP         r8,#0

    LDR         r4,[sp,#0xD4]           @Loads ht
    SUBEQ       r8,r0,r1                @pu1_src - src_strd

    MOVNE       r8,r3
    ADD         r5,sp,#0x42             @*au1_src_left_tmp

    LDR         r7,[sp,#0xD0]           @Loads wd
    ADD         r8,r8,#1                @pu1_src - src_strd + 1

    SUB         r7,r7,r6                @(wd - col)
    VLD1.8      D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r8,[sp,#0xC0]           @Loads *pu1_src
    VLD1.8      D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r7,r7,#15               @15 + (wd - col)

    ADD         r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCGT.U8     Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1              @load the value and increment by src_strd
    SUBS        r4,r4,#1                @decrement the loop count
    STRB        r8,[r5,#1]!             @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP

    VMOV.I8     Q9,#0
    VCLT.U8     Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD         r8,r0,r1                @I *pu1_src + src_strd
    VSUB.U8     Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                  @row count, move ht_tmp to r7

    SUB         r5,r12,r7               @I ht_tmp - row
    VLD1.8      D16,[r8]!               @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r8,r14,r5               @I pu1_src_left_cpy[ht_tmp - row]

    ADD         r8,r8,#1                @I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

    LDR         r5,[sp,#0xC8]           @I Loads pu1_avail
    VMOV.8      D19[7],r8               @I vsetq_lane_u8
    LDRB        r5,[r5,#2]              @I pu1_avail[2]

    VEXT.8      Q9,Q9,Q8,#15            @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP         r5,#0                   @I
    BNE         SIGN_UP_CHANGE_DONE     @I

SIGN_UP_CHANGE:
    LDRB        r8,[r0,#15]             @I pu1_src_cpy[15]
    SUB         r5,r0,r1                @I pu1_src_cpy[16 - src_strd]

    LDRB        r5,[r5,#16]             @I load the value
    SUB         r8,r8,r5                @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0                   @I
    MVNLT       r8,#0                   @I
    MOVGT       r8,#1                   @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8               @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q9,Q6,Q9                @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D18            @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                   @I sign_up = vnegq_s8(sign_down)

    VEXT.8      Q7,Q7,Q7,#1             @I sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19            @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q10,D12                 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18            @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q11,D13                 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D10             @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1              @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D11,{D7},D19            @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2              @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8
    VADDW.S8    Q11,Q11,D11             @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q11,Q11,Q1              @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2              @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                @I Decrement the ht_tmp loop count by 1

PU1_SRC_LOOP:
    ADD         r8,r0,r1,LSL #1         @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                 @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7               @II ht_tmp - row

    ADD         r4,r0,r1                @II pu1_src_cpy[16 - src_strd]
    VMOVN.I16   D21,Q11                 @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r2,r8,r1                @III *pu1_src + src_strd

    LDRB        r11,[r4,#15]            @II pu1_src_cpy[15]
    VLD1.8      D16,[r8]!               @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    SUB         r7,r7,#1                @II Decrement the ht_tmp loop count by 1

    ADD         r8,r14,r5               @II pu1_src_left_cpy[ht_tmp - row]
    VLD1.8      D30,[r2]!               @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r2]                @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r2,#8
    LDRB        r8,[r8,#1]

    LDRB        r4,[r0,#16]             @II load the value
    VMOV.8      D19[7],r8               @II vsetq_lane_u8
    SUB         r11,r11,r4              @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r11,#0                  @II
    VST1.8      {Q10},[r0],r1           @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r5,r12,r7               @III ht_tmp - row

    MVNLT       r11,#0                  @II
    VEXT.8      Q9,Q9,Q8,#15            @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOVGT       r11,#1                  @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD         r8,r14,r5               @III pu1_src_left_cpy[ht_tmp - row]
    VMOV.8      D15[7],r11              @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP         r7,#1                   @III

    BNE         NEXT_ROW_ELSE_2         @III
    LDR         r5,[sp,#0xC8]           @III Loads pu1_avail
    LDRB        r5,[r5,#3]              @III pu1_avail[3]
    CMP         r5,#0                   @III
    SUBNE       r8,r2,#2                @III pu1_src_cpy[src_strd - 1]

NEXT_ROW_ELSE_2:
    LDRB        r8,[r8,#1]              @III
    VCGT.U8     Q12,Q6,Q9               @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r0,r1

    LDRB        r2,[r5,#15]             @III pu1_src_cpy[15]
    VCLT.U8     Q13,Q6,Q9               @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB        r5,[r0,#16]             @III load the value

    SUB         r2,r2,r5                @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q13,Q12             @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP         r2,#0                   @III

    MVNLT       r2,#0                   @III
    VMOV.8      D19[7],r8               @III vsetq_lane_u8
    MOVGT       r2,#1                   @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB         r7,r7,#1                @III Decrement the ht_tmp loop count by 1
    VADD.I8     Q13,Q0,Q7               @II edge_idx = vaddq_s8(const_2, sign_up)

    VNEG.S8     Q7,Q12                  @II sign_up = vnegq_s8(sign_down)
    VEXT.8      Q9,Q9,Q15,#15           @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    VADD.I8     Q13,Q13,Q12             @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VEXT.8      Q7,Q7,Q7,#1             @II sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D26,{D6},D26            @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q5,Q8,Q9                @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r2               @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VTBL.8      D27,{D6},D27            @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q9,Q8,Q9                @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOVL.U8    Q14,D12                 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4              @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VSUB.U8     Q5,Q9,Q5                @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D24,{D7},D26            @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q0,Q7                @III edge_idx = vaddq_s8(const_2, sign_up)

    VADD.I8     Q9,Q9,Q5                @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D25,{D7},D27            @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VNEG.S8     Q7,Q5                   @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24             @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D18,{D6},D18            @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1              @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VEXT.8      Q7,Q7,Q7,#1             @III sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19            @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q14,Q14,Q2              @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q13,D13                 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VADDW.S8    Q13,Q13,D25             @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D10,{D7},D18            @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q13,Q13,Q1              @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q10,D16                 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16    Q13,Q13,Q2              @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VADDW.S8    Q10,Q10,D10             @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D11,{D7},D19            @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1              @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q11,D17                 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2              @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                 @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q11,Q11,D11             @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOVN.I16   D29,Q13                 @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q11,Q11,Q1              @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOV        Q6,Q15                  @II pu1_cur_row = pu1_next_row
    VMIN.U16    Q11,Q11,Q2              @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP         r7,#1                   @III
    VST1.8      {Q14},[r0],r1           @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP            @If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    ADD         r8,r0,r1,LSL #1         @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                 @III vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail

    LDRB        r5,[r5,#3]              @pu1_avail[3]
    VMOVN.I16   D21,Q11                 @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r5,#0

    ADD         r4,r0,r1                @pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r0,#16]             @load the value

    BEQ         NEXT_ROW_ELSE_3
    LDRB        r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    B           NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB         r11,r12,r7              @ht_tmp - row
    ADD         r8,r14,r11              @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB        r11,[r4,#15]            @pu1_src_cpy[15]
    VMOV.8      D19[7],r8               @vsetq_lane_u8
    SUB         r8,r11,r5               @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MVNLT       r8,#0

    VST1.8      {Q10},[r0],r1           @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VCGT.U8     Q12,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOVGT       r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VCLT.U8     Q13,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VSUB.U8     Q12,Q13,Q12             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOVL.U8    Q10,D12                 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8     Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)

    VMOVL.U8    Q11,D13                 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADD.I8     Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VADDW.S8    Q10,Q10,D24             @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D7},D27            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1              @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2              @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q11,Q11,D25             @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q11,Q11,Q1              @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2              @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    VMOVN.I16   D20,Q10                 @vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r8,[sp,#0xD4]           @Loads ht

    VMOVN.I16   D21,Q11                 @vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r5,sp,#0x42             @*au1_src_left_tmp

    VST1.8      {Q10},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#0xC4]           @Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR         r7,[r5],#4              @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16               @Decrement the wd loop count by 16
    CMP         r6,#8                   @Check whether residue remains
    BLT         RE_ASSINING_LOOP        @Jump to re-assigning loop
    LDR         r7,[sp,#0xD0]           @Loads wd
    LDR         r0,[sp,#0x90]           @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16           @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE           @If residue remains jump to residue loop


WD_16_HT_4_LOOP:
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    LDR         r7,[sp,#0xD0]           @Loads wd
    CMP         r6,r7                   @col == wd
    LDRBEQ      r8,[r5]                 @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                  @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]              @pu1_avail[1]
    VMOV.8      d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]              @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8

    ADD         r3,r3,#16
    ADD         r5,sp,#0x42             @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]           @Loads ht
    LDR         r7,[sp,#0xD0]           @Loads wd
    SUB         r7,r7,r6                @(wd - col)
    ADD         r7,r7,#15               @15 + (wd - col)
    LDR         r8,[sp,#0xC0]           @Loads *pu1_src
    ADD         r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1              @load the value and increment by src_strd
    STRB        r8,[r5,#1]!             @store it in the stack pointer
    SUBS        r4,r4,#1                @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8      D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8     Q9,#0
    MOV         r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                @*pu1_src + src_strd
    VLD1.8      D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB        r5,[r5,#3]              @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB         r5,r12,r7               @ht_tmp - row
    ADD         r8,r14,r5               @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    VMOV.8      D19[7],r8               @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB        r5,[r5,#2]              @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#15]             @pu1_src_cpy[15]
    ADD         r5,r0,#16               @pu1_src_cpy[16]
    SUB         r5,r5,r1                @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                 @load the value
    SUB         r8,r8,r5                @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1             @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24             @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1              @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2              @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25             @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1              @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2              @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                 @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                 @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0xD4]           @Loads ht
    ADD         r5,sp,#0x42             @*au1_src_left_tmp
    LDR         r2,[sp,#0xC4]           @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4              @au1_src_left_tmp[row]
    STR         r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16               @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP        @Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP         @If not equal jump to width_loop


WIDTH_RESIDUE:
    LDR         r7,[sp,#0xD0]           @Loads wd
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    CMP         r6,r7                   @wd_residue == wd
    LDRBEQ      r8,[r5]                 @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]              @pu1_avail[1]
    VMOV.8      d8[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

PU1_AVAIL_2_RESIDUE:
    LDRB        r8,[r5,#2]              @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8


    ADD         r5,sp,#0x42             @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]           @Loads ht
    LDR         r7,[sp,#0xD0]           @Loads wd
    LDR         r8,[sp,#0xC0]           @Loads *pu1_src
    SUB         r7,r7,#1                @(wd - 1)
    ADD         r7,r8,r7                @pu1_src[0 * src_strd + (wd - 1)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1              @load the value and increment by src_strd
    STRB        r8,[r5,#1]!             @store it in the stack pointer
    SUBS        r4,r4,#1                @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                @*pu1_src + src_strd
    VLD1.8      D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB        r5,[r5,#3]              @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_RESIDUE
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB         r5,r12,r7               @ht_tmp - row
    ADD         r8,r14,r5               @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    VMOV.8      D19[7],r8               @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB        r5,[r5,#2]              @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#15]             @pu1_src_cpy[15]
    ADD         r5,r0,#16               @pu1_src_cpy[16]
    SUB         r5,r5,r1                @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                 @load the value
    SUB         r8,r8,r5                @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1             @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24             @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1              @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2              @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                 @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0xD4]           @Loads ht
    LDR         r2,[sp,#0xC4]           @Loads *pu1_src_left
    ADD         r5,sp,#0x42             @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4              @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r7,[sp,#0xD0]           @Loads wd
    LDR         r0,[sp,#0xC0]           @Loads *pu1_src

    LDR         r11,[sp,#0xD4]          @Loads ht
    ADD         r8,r0,r7                @pu1_src[wd]

    LDR         r4,[sp,#0xBC]           @Loads pu1_src_top_left
    SUB         r11,r11,#1              @ht - 1

    STRB        r9,[r8,#-1]             @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    MLA         r6,r11,r1,r0            @pu1_src_org[(ht - 1) * src_strd]

    LDRB        r8,[sp]                 @load u1_src_top_left_tmp from stack pointer
    ADD         r12,sp,#0x02

    STRB        r10,[r6]                @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB        r8,[r4]                 @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0xCC]           @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!               @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8                @Decrement the width
    VST1.8      D0,[r3]!                @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}        @Reload the registers from SP