///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_band_offset_chroma.s
//*
//* @brief
//*  Contains the function definition for SAO band offset on interleaved
//*  (U,V) chroma samples.
//*  The function is hand-written AArch64 NEON assembly in GNU assembler
//*  syntax.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_band_offset_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
//                                  WORD32 src_strd,
//                                  UWORD8 *pu1_src_left,
//                                  UWORD8 *pu1_src_top,
//                                  UWORD8 *pu1_src_top_left,
//                                  WORD32 sao_band_pos_u,
//                                  WORD32 sao_band_pos_v,
//                                  WORD8 *pi1_sao_offset_u,
//                                  WORD8 *pi1_sao_offset_v,
//                                  WORD32 wd,
//                                  WORD32 ht)
//
//**************Variables Vs Registers*****************************************
//x0      => *pu1_src
//x1      => src_strd            (32-bit argument, sign-extended on entry)
//x2      => *pu1_src_left
//x3      => *pu1_src_top
//x4      => *pu1_src_top_left
//x5      => sao_band_pos_u      (32-bit argument, sign-extended into x16)
//x6      => sao_band_pos_v      (32-bit argument, sign-extended into x17)
//x7      => *pi1_sao_offset_u
//[sp,#0] => *pi1_sao_offset_v   (loaded into x8)
//[sp,#8] => wd                  (loaded into x9)
//[sp,#16]=> ht                  (loaded into x10)
//
// What the routine does, in order:
//   1. copies the last two bytes of every row of pu1_src to pu1_src_left
//   2. copies pu1_src_top[wd-2 .. wd-1] to pu1_src_top_left
//   3. copies row (ht-1) of pu1_src to pu1_src_top
//   4. builds one 32-byte lookup table per plane from gu1_table_band_idx,
//      (sao_band_pos << 3) and pi1_sao_offset[1..4]
//   5. rewrites pu1_src in place through TBX table lookups
//
// The loops have no zero-trip guard and step by 8 columns / 4 rows:
// assumes wd is a non-zero multiple of 8 and ht a non-zero multiple of 4
// -- TODO confirm against callers.

.text
.p2align 2
.include "ihevc_neon_macros.s"

.globl gu1_table_band_idx
.globl ihevc_sao_band_offset_chroma_av8

ihevc_sao_band_offset_chroma_av8:
    // Arguments 9..11 are passed on the stack; fetch them before sp moves.
    ldr     x8,[sp,#0]                  // pi1_sao_offset_v
    ldr     w9,[sp,#8]                  // wd (a write to w9 clears x9[63:32])
    ldr     w10,[sp,#16]                // ht (a write to w10 clears x10[63:32])

    push_v_regs                         // macro from ihevc_neon_macros.s; presumably saves the
                                        // callee-saved NEON registers (v8-v15 are written below)
    stp     x19, x20,[sp,#-16]!
    stp     x21, x22,[sp,#-16]!
    stp     x23, x24,[sp,#-16]!

    // src_strd, sao_band_pos_u and sao_band_pos_v are WORD32. AAPCS64 leaves
    // bits [63:32] of their argument registers unspecified, and they are used
    // below in 64-bit address arithmetic and 64-bit compares, so widen them.
    sxtw    x1,w1                       // src_strd

    mov     x15,x4                      // pu1_src_top_left
    sxtw    x16,w5                      // sao_band_pos_u
    sxtw    x17,w6                      // sao_band_pos_v
    mov     x19,x7                      // pi1_sao_offset_u
    mov     x20,x8                      // pi1_sao_offset_v
    mov     x21,x9                      // wd
    mov     x22,x10                     // ht

    MOV     x4, x15                     // pu1_src_top_left
    MOV     x10, x22                    // ht

    MOV     x9, x21                     // wd
    MOV     x11,x10                     // x11 = row counter for SRC_LEFT_LOOP

    ADD     x12,x0,x9                   // &pu1_src[wd]
    ADRP    x14, :got:gu1_table_band_idx
    LDR     x14, [x14, #:got_lo12:gu1_table_band_idx] // x14 = &gu1_table_band_idx

    SUB     x12,x12,#2                  // &pu1_src[wd - 2]: last two bytes of row 0

SRC_LEFT_LOOP:                          // one row per iteration
    LDRH    w5,[x12]                    // last two bytes of the current row (x5 is reloaded later)
    ADD     x12,x12,x1                  // next row
    SUBS    x11,x11,#1                  // Decrement the loop counter
    STRH    w5,[x2],#2                  // Store the value in pu1_src_left pointer
    BNE     SRC_LEFT_LOOP

    MOV     x5, x16                     // sao_band_pos_u
    LD1     {v1.8b},[x14],#8            // band_table_u.val[0]
    ADD     x12,x3,x9                   // &pu1_src_top[wd]

    sub     x23,x12,#2                  // &pu1_src_top[wd - 2]
    LDRH    w11,[x23]                   // read before SRC_TOP_LOOP overwrites pu1_src_top
    LD1     {v2.8b},[x14],#8            // band_table_u.val[1]
    LSL     x6,x5,#3                    // sao_band_pos_u << 3

    STRH    w11,[x4]                    // store to pu1_src_top_left[0..1]
    LD1     {v3.8b},[x14],#8            // band_table_u.val[2]
    MOV     x7, x19                     // pi1_sao_offset_u

    SUB     x4,x10,#1                   // ht - 1
    dup     v31.8b,w6                   // band_pos_u = low byte of (sao_band_pos_u << 3) in every lane
    mul     x4, x4, x1                  // (ht - 1) * src_strd

    ADD     x4,x4,x0                    // &pu1_src[(ht - 1) * src_strd]
    LD1     {v4.8b},[x14],#8            // band_table_u.val[3]
    MOV     x11,x9                      // x11 = column counter for SRC_TOP_LOOP

SRC_TOP_LOOP:                           // 8 bytes per iteration
    LD1     {v0.8b},[x4],#8             // Load pu1_src[(ht - 1) * src_strd + col]
    SUBS    x11,x11,#8                  // Decrement the loop counter by 8
    ST1     {v0.8b},[x3],#8             // Store to pu1_src_top[col]
    BNE     SRC_TOP_LOOP

    // ---- U lookup table: table[k] + band_pos_u + offset_u[k + 1], k = 0..3 ----
    LD1     {v30.8b},[x7]               // 8 bytes of pi1_sao_offset_u (bytes 1..4 are used)
    ADD     v5.8b, v1.8b , v31.8b       // band_table_u.val[0] = vadd_u8(band_table_u.val[0], band_pos_u)

    dup     v29.8b, v30.b[1]            // vdup_n_u8(pi1_sao_offset_u[1])
    ADD     v6.8b, v2.8b , v31.8b       // band_table_u.val[1] = vadd_u8(band_table_u.val[1], band_pos_u)

    dup     v28.8b, v30.b[2]            // vdup_n_u8(pi1_sao_offset_u[2])
    ADD     v7.8b, v3.8b , v31.8b       // band_table_u.val[2] = vadd_u8(band_table_u.val[2], band_pos_u)

    dup     v27.8b, v30.b[3]            // vdup_n_u8(pi1_sao_offset_u[3])
    ADD     v8.8b, v4.8b , v31.8b       // band_table_u.val[3] = vadd_u8(band_table_u.val[3], band_pos_u)

    CMP     x5,#28                      // flags stay live until the BLT/BNE below:
                                        // nothing in between writes NZCV
    dup     v26.8b, v30.b[4]            // vdup_n_u8(pi1_sao_offset_u[4])
    ADRP    x14, :got:gu1_table_band_idx
    LDR     x14, [x14, #:got_lo12:gu1_table_band_idx] // rewind x14 to the table start for V

    movi    v30.8b, #16                 // vdup_n_u8(16)
    ADD     v1.8b, v5.8b , v29.8b       // band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))

    LD1     {v9.8b},[x14],#8            // band_table_v.val[0]
    ADD     v2.8b, v6.8b , v28.8b       // band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))

    LD1     {v10.8b},[x14],#8           // band_table_v.val[1]
    ADD     v3.8b, v7.8b , v27.8b       // band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))

    MOV     x6, x17                     // sao_band_pos_v
    ADD     v4.8b, v8.8b , v26.8b       // band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
    LSL     x11,x6,#3                   // sao_band_pos_v << 3

    BLT     SAO_BAND_POS_U_0            // sao_band_pos_u < 28

    // switch (sao_band_pos_u): each case computes its compare mask before
    // branching because the next case ANDs with it.
SAO_BAND_POS_U_28:                      // case 28
    cmhs    v13.8b, v30.8b , v4.8b      // vcle_u8(band_table.val[3], vdup_n_u8(16))
    BNE     SAO_BAND_POS_U_29           // still testing the flags of CMP x5,#28

    ORR     v4.8b, v4.8b , v13.8b       // band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B       SWITCH_BREAK_U

SAO_BAND_POS_U_29:                      // case 29
    CMP     x5,#29

    cmhs    v14.8b, v30.8b , v3.8b      // vcle_u8(band_table.val[2], vdup_n_u8(16))
    BNE     SAO_BAND_POS_U_30
    ORR     v3.8b, v3.8b , v14.8b       // band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND     v4.8b, v4.8b , v13.8b       // band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B       SWITCH_BREAK_U

SAO_BAND_POS_U_30:                      // case 30
    CMP     x5,#30

    cmhs    v15.8b, v30.8b , v2.8b      // vcle_u8(band_table.val[1], vdup_n_u8(16))
    BNE     SAO_BAND_POS_U_31
    ORR     v2.8b, v2.8b , v15.8b       // band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND     v3.8b, v3.8b , v14.8b       // band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B       SWITCH_BREAK_U              // explicit break, matching the V switch

SAO_BAND_POS_U_31:                      // case 31
    CMP     x5,#31
    BNE     SWITCH_BREAK_U

    cmhs    v16.8b, v30.8b , v1.8b      // vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR     v1.8b, v1.8b , v16.8b       // band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND     v2.8b, v2.8b , v15.8b       // band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    B       SWITCH_BREAK_U

SAO_BAND_POS_U_0:
    CMP     x5,#0                       // case 0
    BNE     SWITCH_BREAK_U

    cmhs    v16.8b, v30.8b , v1.8b      // vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND     v1.8b, v1.8b , v16.8b       // band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK_U:
    // ---- V lookup table: table[k] + band_pos_v + offset_v[k + 1], k = 0..3 ----
    dup     v30.8b,w11                  // band_pos_v = low byte of (sao_band_pos_v << 3) in every lane
    MOV     x8, x20                     // pi1_sao_offset_v

    LD1     {v11.8b},[x14],#8           // band_table_v.val[2]
    ADD     v13.8b, v9.8b , v30.8b      // band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)

    LD1     {v12.8b},[x14],#8           // band_table_v.val[3]
    ADD     v14.8b, v10.8b , v30.8b     // band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)

    LD1     {v25.8b},[x8]               // 8 bytes of pi1_sao_offset_v (bytes 1..4 are used)
    ADD     v15.8b, v11.8b , v30.8b     // band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)

    dup     v29.8b, v25.b[1]            // vdup_n_u8(pi1_sao_offset_v[1])
    ADD     v16.8b, v12.8b , v30.8b     // band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)

    dup     v28.8b, v25.b[2]            // vdup_n_u8(pi1_sao_offset_v[2])
    ADD     v9.8b, v13.8b , v29.8b      // band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))

    dup     v27.8b, v25.b[3]            // vdup_n_u8(pi1_sao_offset_v[3])
    ADD     v10.8b, v14.8b , v28.8b     // band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))

    dup     v26.8b, v25.b[4]            // vdup_n_u8(pi1_sao_offset_v[4])
    ADD     v11.8b, v15.8b , v27.8b     // band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))

    movi    v29.8b, #16                 // vdup_n_u8(16)
    ADD     v12.8b, v16.8b , v26.8b     // band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))

    CMP     x6,#28
    BLT     SAO_BAND_POS_V_0            // sao_band_pos_v < 28

    // switch (sao_band_pos_v): same structure as the U switch.
SAO_BAND_POS_V_28:                      // case 28
    cmhs    v17.8b, v29.8b , v12.8b     // vcle_u8(band_table.val[3], vdup_n_u8(16))
    BNE     SAO_BAND_POS_V_29           // still testing the flags of CMP x6,#28
    ORR     v12.8b, v12.8b , v17.8b     // band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B       SWITCH_BREAK_V

SAO_BAND_POS_V_29:                      // case 29
    CMP     x6,#29

    cmhs    v18.8b, v29.8b , v11.8b     // vcle_u8(band_table.val[2], vdup_n_u8(16))
    BNE     SAO_BAND_POS_V_30
    ORR     v11.8b, v11.8b , v18.8b     // band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND     v12.8b, v12.8b , v17.8b     // band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B       SWITCH_BREAK_V

SAO_BAND_POS_V_30:                      // case 30
    CMP     x6,#30

    cmhs    v19.8b, v29.8b , v10.8b     // vcle_u8(band_table.val[1], vdup_n_u8(16))
    BNE     SAO_BAND_POS_V_31
    ORR     v10.8b, v10.8b , v19.8b     // band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND     v11.8b, v11.8b , v18.8b     // band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B       SWITCH_BREAK_V

SAO_BAND_POS_V_31:                      // case 31
    CMP     x6,#31
    BNE     SWITCH_BREAK_V

    cmhs    v20.8b, v29.8b , v9.8b      // vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR     v9.8b, v9.8b , v20.8b       // band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND     v10.8b, v10.8b , v19.8b     // band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    B       SWITCH_BREAK_V

SAO_BAND_POS_V_0:
    CMP     x6,#0                       // case 0
    BNE     SWITCH_BREAK_V

    cmhs    v20.8b, v29.8b , v9.8b      // vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND     v9.8b, v9.8b , v20.8b       // band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK_V:
    CMP     x9,#16                      // flags are consumed by the BLT below (vector moves keep NZCV)
    MOV     x4,x0                       // pu1_src_cpy
    // Pack the four 8-byte sub-tables of each plane into two consecutive
    // 16-byte registers so that one TBX can index all 32 entries.
    mov     v1.d[1],v2.d[0]             // v1  = { U val[0], U val[1] }
    mov     v2.d[0],v3.d[0]
    mov     v2.d[1],v4.d[0]             // v2  = { U val[2], U val[3] }
    mov     v9.d[1],v10.d[0]            // v9  = { V val[0], V val[1] }
    mov     v10.d[0],v11.d[0]
    mov     v10.d[1],v12.d[0]           // v10 = { V val[2], V val[3] }
    BLT     WIDTH_RESIDUE               // wd < 16: only an 8-byte column remains

WIDTH_LOOP:                             // one 16-byte wide column (8 U + 8 V samples) per pass
    MOV     x4,x0                       // pu1_src_cpy, row 0
    MOV     x11,x10                     // x11 = row counter
    ADD     x5,x4,x1                    // row 1

HEIGHT_LOOP:                            // unrolled for 4 rows: x4, x5, x6, x7

    ADD     x6,x5,x1                    // row 2
    LD2     {v5.8b, v6.8b},[x4]         // row 0, de-interleaved: v5 = U, v6 = V
    ADD     x7,x6,x1                    // row 3

    LD2     {v13.8b, v14.8b},[x5]       // row 1, de-interleaved
    SUB     v7.8b, v5.8b , v31.8b       // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    LD2     {v17.8b, v18.8b},[x6]       // row 2, de-interleaved
    SUB     v8.8b, v6.8b , v30.8b       // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    LD2     {v21.8b, v22.8b},[x7]       // row 3, de-interleaved
    SUB     v15.8b, v13.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    // TBX leaves a destination lane unchanged when its index is >= 32, so
    // samples outside the 32-entry table keep their original value.
    TBX     v5.8b, {v1.16b-v2.16b},v7.8b    // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB     v16.8b, v14.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX     v6.8b, {v9.16b-v10.16b},v8.8b   // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB     v19.8b, v17.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX     v13.8b, {v1.16b-v2.16b},v15.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB     v20.8b, v18.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX     v14.8b, {v9.16b-v10.16b},v16.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB     v23.8b, v21.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    ST2     {v5.8b, v6.8b},[x4]         // row 0 written back, re-interleaved
    SUB     v24.8b, v22.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    SUBS    x11,x11,#4                  // Decrement the ht loop count by 4 (flags used by BNE below)
    TBX     v17.8b, {v1.16b-v2.16b},v19.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))

    ST2     {v13.8b, v14.8b},[x5]       // row 1 written back

    TBX     v18.8b, {v9.16b-v10.16b},v20.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    TBX     v21.8b, {v1.16b-v2.16b},v23.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    TBX     v22.8b, {v9.16b-v10.16b},v24.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))

    ST2     {v17.8b, v18.8b},[x6],x1    // row 2 written back; x6 advances to row 3

    ADD     x4,x6,x1                    // row 0 of the next group of four
    ST2     {v21.8b, v22.8b},[x7]       // row 3 written back
    ADD     x5,x4,x1                    // row 1 of the next group of four

    BNE     HEIGHT_LOOP

    SUB     x9,x9,#16                   // Decrement the width loop by 16
    ADD     x0,x0,#16                   // next 16-byte column
    CMP     x9,#8
    BGT     WIDTH_LOOP                  // more than 8 bytes left: another full column
    BLT     END_LOOP                    // fewer than 8 bytes left: done
    MOV     x4,x0                       // exactly 8 bytes left: pu1_src_cpy for the residue

WIDTH_RESIDUE:                          // last 8-byte column (4 U + 4 V), 4 rows per pass, x10 = rows left
    // NOTE(review): each LD2 below loads 16 bytes although only the first 8
    // belong to this column (only 8 are stored back), i.e. it reads 8 bytes
    // past the column on every row -- confirm the source buffer is padded.

    ADD     x5,x4,x1                    // row 1
    LD2     {v5.8b, v6.8b},[x4]         // row 0, de-interleaved
    ADD     x6,x5,x1                    // row 2

    ADD     x7,x6,x1                    // row 3
    LD2     {v13.8b, v14.8b},[x5]       // row 1, de-interleaved
    SUB     v7.8b, v5.8b , v31.8b       // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    LD2     {v17.8b, v18.8b},[x6]       // row 2, de-interleaved
    SUB     v8.8b, v6.8b , v30.8b       // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX     v5.8b, {v1.16b-v2.16b},v7.8b    // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB     v15.8b, v13.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX     v6.8b, {v9.16b-v10.16b},v8.8b   // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB     v16.8b, v14.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    LD2     {v21.8b, v22.8b},[x7]       // row 3, de-interleaved
    SUB     v19.8b, v17.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX     v13.8b, {v1.16b-v2.16b},v15.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB     v20.8b, v18.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX     v14.8b, {v9.16b-v10.16b},v16.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    ZIP1    v28.8b, v5.8b, v6.8b        // re-interleave the low four U/V pairs of row 0
    ZIP2    v6.8b, v5.8b, v6.8b
    mov     v5.8b, v28.8b

    TBX     v17.8b, {v1.16b-v2.16b},v19.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB     v23.8b, v21.8b , v31.8b     // vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    ST1     {v5.8b},[x4]                // row 0: store 8 interleaved bytes
    ZIP1    v28.8b, v13.8b, v14.8b      // re-interleave row 1
    ZIP2    v14.8b, v13.8b, v14.8b
    mov     v13.8b, v28.8b

    TBX     v18.8b, {v9.16b-v10.16b},v20.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB     v24.8b, v22.8b , v30.8b     // vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    ST1     {v13.8b},[x5]               // row 1: store 8 interleaved bytes
    SUBS    x10,x10,#4                  // Decrement the ht loop count by 4 (flags used by BNE below)

    TBX     v21.8b, {v1.16b-v2.16b},v23.8b  // vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    ZIP1    v28.8b, v17.8b, v18.8b      // re-interleave row 2
    ZIP2    v18.8b, v17.8b, v18.8b
    mov     v17.8b, v28.8b

    TBX     v22.8b, {v9.16b-v10.16b},v24.8b // vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    ST1     {v17.8b},[x6],x1            // row 2: store 8 bytes; x6 advances to row 3
    ZIP1    v28.8b, v21.8b, v22.8b      // re-interleave row 3
    ZIP2    v22.8b, v21.8b, v22.8b
    mov     v21.8b, v28.8b

    ADD     x4,x6,x1                    // row 0 of the next group of four
    ST1     {v21.8b},[x7]               // row 3: store 8 interleaved bytes
    ADD     x5,x4,x1                    // recomputed at the top of the loop as well

    BNE     WIDTH_RESIDUE

END_LOOP:
    // Restore in the reverse order of the prologue.
    ldp     x23, x24,[sp],#16
    ldp     x21, x22,[sp],#16
    ldp     x19, x20,[sp],#16
    pop_v_regs                          // counterpart of push_v_regs
    ret